list.of.packages <- c("tidyverse", "reshape2", "plotly", "vcfR", "SNPRelate", "network", "janitor", "vcfR") #add new libraries here
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
# Load all libraries
lapply(list.of.packages, FUN = function(X) {
do.call("require", list(X))
})
load(file="../raw-data/sample.info")
sample.info.adult <- sample.info %>%
select(sample, stage, population, pCO2.parent, temp.parent) %>% filter(stage=="adult") %>%
mutate(pop_code = as.factor(paste(population, pCO2.parent, sep="-"))) %>% droplevels()
sample.info.larvae <- sample.info %>%
select(sample, stage, population, pCO2.parent) %>% filter(stage=="larvae" & sample!=571) %>%
mutate(pop_code = as.factor(paste(population, pCO2.parent, sep="-")),
sample=str_pad(sample, width=3, side="left", pad="0")) %>% droplevels()
sample.info.juvenile <- sample.info %>%
select(sample, stage, population, pCO2.parent) %>% filter(stage=="juvenile") %>%
mutate(pop_code = as.factor(paste(population, pCO2.parent, sep="-")),
sample=str_pad(sample, width=3, side="left", pad="0")) %>% droplevels()
/Applications/bioinformatics/gatk-4.1.9.0/gatk VariantFiltration \
-R ../references/Olurida_v081_concat_rehead.fa \
-V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult.vcf.gz \
-O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered.vcf.gz \
--filter-name "FS" \
--filter "FS > 60.0" \
--filter-name "QUAL30" \
--filter "QUAL < 30.0" \
--filter-name "SOR3" \
--filter "SOR > 3.0" \
--filter-name "DP50" \
--filter "DP < 50" \
--filter-name "DP450" \
--filter "DP > 450"
## Using GATK jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar
## Running:
## java -Dsamjdk.use_async_io_read_samtools=false -Dsamjdk.use_async_io_write_samtools=true -Dsamjdk.use_async_io_write_tribble=false -Dsamjdk.compression_level=2 -jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar VariantFiltration -R ../references/Olurida_v081_concat_rehead.fa -V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult.vcf.gz -O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered.vcf.gz --filter-name FS --filter FS > 60.0 --filter-name QUAL30 --filter QUAL < 30.0 --filter-name SOR3 --filter SOR > 3.0 --filter-name DP50 --filter DP < 50 --filter-name DP450 --filter DP > 450
## 20:23:45.707 INFO NativeLibraryLoader - Loading libgkl_compression.dylib from jar:file:/Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar!/com/intel/gkl/native/libgkl_compression.dylib
## Apr 20, 2021 8:23:46 PM shaded.cloud_nio.com.google.auth.oauth2.ComputeEngineCredentials runningOnComputeEngine
## INFO: Failed to detect whether we are running on Google Compute Engine.
## 20:23:46.203 INFO VariantFiltration - ------------------------------------------------------------
## 20:23:46.203 INFO VariantFiltration - The Genome Analysis Toolkit (GATK) v4.1.9.0
## 20:23:46.203 INFO VariantFiltration - For support and documentation go to https://software.broadinstitute.org/gatk/
## 20:23:46.203 INFO VariantFiltration - Executing as laura@lauras-MacBook-Pro-2.local on Mac OS X v10.15.7 x86_64
## 20:23:46.203 INFO VariantFiltration - Java runtime: Java HotSpot(TM) 64-Bit Server VM v11.0.1+13-LTS
## 20:23:46.204 INFO VariantFiltration - Start Date/Time: April 20, 2021 at 8:23:45 PM PDT
## 20:23:46.204 INFO VariantFiltration - ------------------------------------------------------------
## 20:23:46.204 INFO VariantFiltration - ------------------------------------------------------------
## 20:23:46.204 INFO VariantFiltration - HTSJDK Version: 2.23.0
## 20:23:46.204 INFO VariantFiltration - Picard Version: 2.23.3
## 20:23:46.204 INFO VariantFiltration - HTSJDK Defaults.COMPRESSION_LEVEL : 2
## 20:23:46.204 INFO VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_READ_FOR_SAMTOOLS : false
## 20:23:46.204 INFO VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_SAMTOOLS : true
## 20:23:46.204 INFO VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_TRIBBLE : false
## 20:23:46.204 INFO VariantFiltration - Deflater: IntelDeflater
## 20:23:46.204 INFO VariantFiltration - Inflater: IntelInflater
## 20:23:46.205 INFO VariantFiltration - GCS max retries/reopens: 20
## 20:23:46.205 INFO VariantFiltration - Requester pays: disabled
## 20:23:46.205 INFO VariantFiltration - Initializing engine
## 20:23:46.333 INFO FeatureManager - Using codec VCFCodec to read file file:///Volumes/Bumblebee/O.lurida_QuantSeq-2020/notebooks/../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult.vcf.gz
## 20:23:46.445 INFO VariantFiltration - Done initializing engine
## 20:23:46.508 INFO ProgressMeter - Starting traversal
## 20:23:46.508 INFO ProgressMeter - Current Locus Elapsed Minutes Variants Processed Variants/Minute
## 20:23:56.520 INFO ProgressMeter - Contig41068_47193:2790524 0.2 557000 3337994.4
## 20:24:02.134 INFO ProgressMeter - Contig98527_104152:29794069 0.3 924617 3550302.1
## 20:24:02.135 INFO ProgressMeter - Traversal complete. Processed 924617 total variants in 0.3 minutes.
## 20:24:02.201 INFO VariantFiltration - Shutting down engine
## [April 20, 2021 at 8:24:02 PM PDT] org.broadinstitute.hellbender.tools.walkers.filters.VariantFiltration done. Elapsed time: 0.28 minutes.
## Runtime.totalMemory()=322961408
/Applications/bioinformatics/gatk-4.1.9.0/gatk SelectVariants \
-R ../references/Olurida_v081_concat_rehead.fa \
-V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered.vcf.gz \
--exclude-filtered TRUE \
--select-type-to-include SNP \
-O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered-true.vcf.gz
## Using GATK jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar
## Running:
## java -Dsamjdk.use_async_io_read_samtools=false -Dsamjdk.use_async_io_write_samtools=true -Dsamjdk.use_async_io_write_tribble=false -Dsamjdk.compression_level=2 -jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar SelectVariants -R ../references/Olurida_v081_concat_rehead.fa -V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered.vcf.gz --exclude-filtered TRUE --select-type-to-include SNP -O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered-true.vcf.gz
## 20:24:04.169 INFO NativeLibraryLoader - Loading libgkl_compression.dylib from jar:file:/Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar!/com/intel/gkl/native/libgkl_compression.dylib
## Apr 20, 2021 8:24:04 PM shaded.cloud_nio.com.google.auth.oauth2.ComputeEngineCredentials runningOnComputeEngine
## INFO: Failed to detect whether we are running on Google Compute Engine.
## 20:24:04.431 INFO SelectVariants - ------------------------------------------------------------
## 20:24:04.432 INFO SelectVariants - The Genome Analysis Toolkit (GATK) v4.1.9.0
## 20:24:04.432 INFO SelectVariants - For support and documentation go to https://software.broadinstitute.org/gatk/
## 20:24:04.432 INFO SelectVariants - Executing as laura@lauras-MacBook-Pro-2.local on Mac OS X v10.15.7 x86_64
## 20:24:04.432 INFO SelectVariants - Java runtime: Java HotSpot(TM) 64-Bit Server VM v11.0.1+13-LTS
## 20:24:04.432 INFO SelectVariants - Start Date/Time: April 20, 2021 at 8:24:04 PM PDT
## 20:24:04.432 INFO SelectVariants - ------------------------------------------------------------
## 20:24:04.432 INFO SelectVariants - ------------------------------------------------------------
## 20:24:04.432 INFO SelectVariants - HTSJDK Version: 2.23.0
## 20:24:04.432 INFO SelectVariants - Picard Version: 2.23.3
## 20:24:04.433 INFO SelectVariants - HTSJDK Defaults.COMPRESSION_LEVEL : 2
## 20:24:04.433 INFO SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_READ_FOR_SAMTOOLS : false
## 20:24:04.433 INFO SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_SAMTOOLS : true
## 20:24:04.433 INFO SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_TRIBBLE : false
## 20:24:04.433 INFO SelectVariants - Deflater: IntelDeflater
## 20:24:04.433 INFO SelectVariants - Inflater: IntelInflater
## 20:24:04.433 INFO SelectVariants - GCS max retries/reopens: 20
## 20:24:04.433 INFO SelectVariants - Requester pays: disabled
## 20:24:04.433 INFO SelectVariants - Initializing engine
## 20:24:04.558 INFO FeatureManager - Using codec VCFCodec to read file file:///Volumes/Bumblebee/O.lurida_QuantSeq-2020/notebooks/../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered.vcf.gz
## 20:24:04.652 INFO SelectVariants - Done initializing engine
## 20:24:04.683 INFO ProgressMeter - Starting traversal
## 20:24:04.683 INFO ProgressMeter - Current Locus Elapsed Minutes Variants Processed Variants/Minute
## 20:24:14.723 INFO ProgressMeter - Contig11179_21262:74087300 0.2 167000 998008.0
## 20:24:24.752 INFO ProgressMeter - Contig28136_34745:69884028 0.3 375000 1121188.0
## 20:24:34.768 INFO ProgressMeter - Contig53181_59083:51903313 0.5 589000 1174671.8
## 20:24:42.082 INFO ProgressMeter - Contig98527_104152:29059358 0.6 747969 1199982.4
## 20:24:42.082 INFO ProgressMeter - Traversal complete. Processed 747969 total variants in 0.6 minutes.
## 20:24:42.132 INFO SelectVariants - Shutting down engine
## [April 20, 2021 at 8:24:42 PM PDT] org.broadinstitute.hellbender.tools.walkers.variantutils.SelectVariants done. Elapsed time: 0.63 minutes.
## Runtime.totalMemory()=387973120
#snpgdsClose(genofile.adult)
vcf.fn.adult <- "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered-true.vcf.gz"
snpgdsVCF2GDS(vcf.fn.adult, "../qc-processing/gatk/genotypes-adult.gds", method="biallelic.only")
## Start file conversion from VCF to SNP GDS ...
## Method: exacting biallelic SNPs
## Number of samples: 53
## Parsing "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered-true.vcf.gz" ...
## import 175243 variants.
## + genotype { Bit2 53x175243, 2.2M } *
## Optimize the access efficiency ...
## Clean up the fragments of GDS file:
## open the file '../qc-processing/gatk/genotypes-adult.gds' (3.1M)
## # of fragments: 68
## save to '../qc-processing/gatk/genotypes-adult.gds.tmp'
## rename '../qc-processing/gatk/genotypes-adult.gds.tmp' (3.1M, reduced: 576B)
## # of fragments: 20
snpgdsSummary("../qc-processing/gatk/genotypes-adult.gds")
## The file name: /Volumes/Bumblebee/O.lurida_QuantSeq-2020/qc-processing/gatk/genotypes-adult.gds
## The total number of samples: 53
## The total number of SNPs: 175243
## SNP genotypes are stored in SNP-major mode (Sample X SNP).
(genofile.adult <- snpgdsOpen("../qc-processing/gatk/genotypes-adult.gds"))
## File: /Volumes/Bumblebee/O.lurida_QuantSeq-2020/qc-processing/gatk/genotypes-adult.gds (3.1M)
## + [ ] *
## |--+ sample.id { Str8 53 LZMA_ra(76.4%), 169B }
## |--+ snp.id { Int32 175243 LZMA_ra(6.47%), 44.3K }
## |--+ snp.rs.id { Str8 175243 LZMA_ra(0.10%), 181B }
## |--+ snp.position { Int32 175243 LZMA_ra(40.8%), 279.2K }
## |--+ snp.chromosome { Str8 175243 LZMA_ra(0.03%), 973B }
## |--+ snp.allele { Str8 175243 LZMA_ra(11.9%), 81.7K }
## |--+ genotype { Bit2 53x175243, 2.2M } *
## \--+ snp.annot [ ]
## |--+ qual { Float32 175243 LZMA_ra(65.4%), 448.0K }
## \--+ filter { Str8 175243 LZMA_ra(0.03%), 285B }
snpset.adult <- snpgdsLDpruning(genofile.adult, maf=.1, missing.rate=0, autosome.only=FALSE)
## SNP pruning based on LD:
## Excluding 174,670 SNPs (monomorphic: TRUE, MAF: 0.1, missing rate: 0)
## # of samples: 53
## # of SNPs: 573
## using 1 thread
## sliding window: 500,000 basepairs, Inf SNPs
## |LD| threshold: 0.2
## method: composite
## Chromosome Contig0_5313: 0.23%, 50/21,863
## Chromosome Contig104153_109739: 0.13%, 3/2,360
## Chromosome Contig109741_116186: 0.27%, 5/1,861
## Chromosome Contig11179_21262: 0.29%, 46/15,668
## Chromosome Contig116188_123130: 0.11%, 2/1,794
## Chromosome Contig123131_129982: 0.22%, 4/1,787
## Chromosome Contig129983_136896: 0.22%, 4/1,798
## Chromosome Contig136897_143777: 0.16%, 3/1,887
## Chromosome Contig143780_150710: 0.06%, 1/1,746
## Chromosome Contig150711_166372: 0.40%, 6/1,485
## Chromosome Contig166374_181896: 0.21%, 4/1,875
## Chromosome Contig181898_197428: 0.20%, 3/1,516
## Chromosome Contig197431_213218: 0.06%, 1/1,777
## Chromosome Contig21263_28135: 0.29%, 53/18,170
## Chromosome Contig213221_398012: 0.38%, 7/1,819
## Chromosome Contig28136_34745: 0.26%, 41/15,929
## Chromosome Contig34748_41067: 0.40%, 54/13,582
## Chromosome Contig398123_696856: 0.29%, 5/1,721
## Chromosome Contig41068_47193: 0.25%, 28/11,130
## Chromosome Contig47194_53180: 0.31%, 28/9,167
## Chromosome Contig5314_11178: 0.17%, 9/5,362
## Chromosome Contig53181_59083: 0.22%, 17/7,588
## Chromosome Contig59084_64831: 0.22%, 14/6,364
## Chromosome Contig64832_70524: 0.28%, 16/5,633
## Chromosome Contig70525_76135: 0.28%, 13/4,689
## Chromosome Contig76136_81765: 0.17%, 7/4,040
## Chromosome Contig81766_87352: 0.24%, 10/4,163
## Chromosome Contig87353_92929: 0.29%, 8/2,773
## Chromosome Contig92931_98526: 0.22%, 6/2,712
## Chromosome Contig98527_104152: 0.27%, 8/2,984
## 456 markers are selected in total.
snpset.adult.id <- unlist(unname(snpset.adult))
# PCA
pca.adult <- snpgdsPCA(genofile.adult, snp.id=snpset.adult.id, num.thread=2, autosome.only=FALSE)
## Principal Component Analysis (PCA) on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 53
## # of SNPs: 456
## using 2 threads
## # of principal components: 32
## PCA: the sum of all selected genotypes (0,1,2) = 37200
## CPU capabilities: Double-Precision SSE2
## Tue Apr 20 20:24:49 2021 (internal increment: 19088)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
## Tue Apr 20 20:24:49 2021 Begin (eigenvalues and eigenvectors)
## Tue Apr 20 20:24:49 2021 Done.
pc.percent.adult <- pca.adult$varprop*100
head(round(pc.percent.adult, 2))
## [1] 9.47 6.28 5.59 4.66 4.53 4.11
tab.adult <- data.frame(sample.id = pca.adult$sample.id,
EV1 = pca.adult$eigenvect[,1], # the first eigenvector
EV2 = pca.adult$eigenvect[,2], # the second eigenvector
EV3 = pca.adult$eigenvect[,3], # the third eigenvector
EV4 = pca.adult$eigenvect[,4], # the fourth eigenvector
EV5 = pca.adult$eigenvect[,5], # the fifth eigenvector
stringsAsFactors = FALSE)
tab.adult.annot <- left_join(tab.adult, sample.info.adult, by=c("sample.id"="sample")) %>% droplevels()
group.adult <- as.factor(tab.adult.annot$pop_code)
#ggplotly(
ggplot(tab.adult.annot, aes(x=EV2, y=EV1, col=population, shape=pCO2.parent)) +
geom_point(size=4) + theme_minimal() + ggtitle("Adult Genetic Structure, PC1xPC2")#)
ggplotly(
ggplot(tab.adult.annot, aes(x=EV3, y=EV1, col=population, shape=pCO2.parent)) +
geom_point(size=4) + theme_minimal() + ggtitle("Adult Genetic Structure, PC1xPC3"))
ggplotly(
ggplot(tab.adult.annot, aes(x=EV4, y=EV1, col=population, shape=pCO2.parent)) +
geom_point(size=4) + theme_minimal() + ggtitle("Adult Genetic Structure, PC1xPC4"))
ggplotly(
ggplot(tab.adult.annot, aes(x=EV5, y=EV1, col=population, shape=pCO2.parent)) +
geom_point(size=4) + theme_minimal() + ggtitle("Adult Genetic Structure, PC1xPC5"))
# To perform cluster analysis on the matrix of genome-wide IBS pairwise distances,
# and determine the groups by a permutation score.
# Result= only 1 group (?)
ibs.hc.adult <- snpgdsHCluster(snpgdsIBS(genofile.adult, snp.id=snpset.adult.id, num.thread=2, autosome.only=FALSE))
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 53
## # of SNPs: 456
## using 2 threads
## IBS: the sum of all selected genotypes (0,1,2) = 37200
## Tue Apr 20 20:24:50 2021 (internal increment: 65536)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
## Tue Apr 20 20:24:50 2021 Done.
rv.adult <- snpgdsCutTree(ibs.hc.adult)
## Determine groups by permutation (Z threshold: 15, outlier threshold: 5):
## Create 1 groups.
plot(rv.adult$dendrogram, leaflab="none", main="HapMap Phase II")
# Determine groups of individuals by population information
rv2.adult <- snpgdsCutTree(ibs.hc.adult, samp.group=as.factor(tab.adult.annot$pop_code))
## Create 6 groups.
plot(rv2.adult$dendrogram, leaflab="none", main="HapMap Phase II")
legend("bottom", legend=levels(group.adult), col=1:nlevels(group.adult), pch=19, ncol=4, cex=0.6, pt.cex=1)
AF.adult.FB.high <- snpgdsSNPRateFreq(genofile.adult, snp.id=snpset.adult.id,
sample.id=subset(tab.adult.annot, population=="Fidalgo Bay" & pCO2.parent=="High")$sample.id,
with.id = TRUE)
AF.adult.FB.amb <- snpgdsSNPRateFreq(genofile.adult, snp.id=snpset.adult.id,
sample.id=subset(tab.adult.annot, population=="Fidalgo Bay" & pCO2.parent=="Ambient")$sample.id,
with.id = TRUE)
AF.adult.DB.high <- snpgdsSNPRateFreq(genofile.adult, snp.id=snpset.adult.id,
sample.id=subset(tab.adult.annot, population=="Dabob Bay" & pCO2.parent=="High")$sample.id,
with.id = TRUE)
AF.adult.DB.amb <- snpgdsSNPRateFreq(genofile.adult, snp.id=snpset.adult.id,
sample.id=subset(tab.adult.annot, population=="Dabob Bay" & pCO2.parent=="Ambient")$sample.id,
with.id = TRUE)
AF.adult.OB1.high <- snpgdsSNPRateFreq(genofile.adult, snp.id=snpset.adult.id,
sample.id=subset(tab.adult.annot, population=="Oyster Bay C1" & pCO2.parent=="High")$sample.id,
with.id = TRUE)
AF.adult.OB1.amb <- snpgdsSNPRateFreq(genofile.adult, snp.id=snpset.adult.id,
sample.id=subset(tab.adult.annot, population=="Oyster Bay C1" & pCO2.parent=="Ambient")$sample.id,
with.id = TRUE)
AF.adults <- rbind(do.call(cbind.data.frame, AF.adult.FB.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("FB"), stage=as.factor("adult"), pCO2=as.factor("high")),
do.call(cbind.data.frame, AF.adult.FB.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("FB"), stage=as.factor("adult"), pCO2=as.factor("ambient")),
do.call(cbind.data.frame, AF.adult.DB.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("DB"), stage=as.factor("adult"), pCO2=as.factor("high")),
do.call(cbind.data.frame, AF.adult.DB.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("DB"), stage=as.factor("adult"), pCO2=as.factor("ambient")),
do.call(cbind.data.frame, AF.adult.OB1.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("OB1"), stage=as.factor("adult"), pCO2=as.factor("high")),
do.call(cbind.data.frame, AF.adult.OB1.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("OB1"), stage=as.factor("adult"), pCO2=as.factor("ambient")))
ggplot(AF.adults, aes(x=pCO2, y=AlleleFreq, fill=population, alpha=pCO2)) + geom_violin(trim = F) + facet_wrap(~population) + theme_minimal() +scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=8, size=4, color="black", fill="black") + xlab("pCO2 exposure") + ylab("Allele Frequency") +
ggtitle(paste("SNP Allele Frequency, Adults by population & pCO2 treatment\n", "(No. loci = ", length(snpset.adult.id), ")", sep="")) +
geom_text(data=AF.adults %>% group_by(population, pCO2) %>% summarize(mean=mean(AlleleFreq)), aes(x=pCO2, y=mean+.1, label=round(mean, 2)))
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Adult Fst Calculations
# within FB (comparing pCO2 exposure):
Fst.adult.FB <- snpgdsFst(genofile.adult, snp.id=snpset.adult.id,
sample.id=subset(tab.adult.annot, population=="Fidalgo Bay")$sample.id,
population=as.factor(subset(tab.adult.annot, population=="Fidalgo Bay")$pCO2.parent),
method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 21 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 18
## # of SNPs: 435
## Method: Weir & Cockerham, 1984
## # of Populations: 2
## Ambient (9), High (9)
paste("Weir and Cockerham mean Fst estimate, Fidalgo Bay Ambient vs. High: ", round(Fst.adult.FB$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Fidalgo Bay Ambient vs. High: 0.01504"
hist(Fst.adult.FB$FstSNP, breaks = 75, main="Fst distribution\nFidalgo Bay Ambient vs. High, Adults", col="dodgerblue3")
# within DB (comparing pCO2 exposure):
Fst.adult.DB <- snpgdsFst(genofile.adult, snp.id=snpset.adult.id,
sample.id=subset(tab.adult.annot, population=="Dabob Bay")$sample.id,
population=as.factor(subset(tab.adult.annot, population=="Dabob Bay")$pCO2.parent),
method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 32 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 17
## # of SNPs: 424
## Method: Weir & Cockerham, 1984
## # of Populations: 2
## Ambient (9), High (8)
paste("Weir and Cockerham mean Fst estimate, Dabob Bay Ambient vs. High: ", round(Fst.adult.DB$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Dabob Bay Ambient vs. High: 0.01595"
hist(Fst.adult.DB$FstSNP, breaks = 100, main="Fst distribution\nDabob Bay Ambient vs. High, Adults", col="darkseagreen3")
# within OB1 (comparing pCO2 exposure):
Fst.adult.OB1 <- snpgdsFst(genofile.adult, snp.id=snpset.adult.id,
sample.id=subset(tab.adult.annot, population=="Oyster Bay C1")$sample.id,
population=as.factor(subset(tab.adult.annot, population=="Oyster Bay C1")$pCO2.parent),
method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 36 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 18
## # of SNPs: 420
## Method: Weir & Cockerham, 1984
## # of Populations: 2
## Ambient (9), High (9)
paste("Weir and Cockerham mean Fst estimate, Oyster Bay C1 Ambient vs. High: ", round(Fst.adult.OB1$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Oyster Bay C1 Ambient vs. High: -0.01404"
hist(Fst.adult.OB1$FstSNP, breaks = 100, main="Fst distribution\nOyster Bay C1 Ambient vs. High, Adults", col="salmon2")
Fst.adults <- rbind(do.call(cbind.data.frame, Fst.adult.FB[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("FB"), stage=as.factor("adult")),
do.call(cbind.data.frame, Fst.adult.DB[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("DB"), stage=as.factor("adult")),
do.call(cbind.data.frame, Fst.adult.OB1[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("OB1"), stage=as.factor("adult")))
ggplot(Fst.adults, aes(x=population, y=FstSNP, fill=population)) + geom_violin(trim = F) +
stat_summary(fun.y=mean, geom="point", shape=8, size=3.5, color="black", fill="black") +
ggtitle(paste("SNP Fst, Adults by pCO2 treatment\n", "(No. loci = ", length(snpset.adult.id), ")", sep="")) +
geom_text(data=Fst.adults %>% group_by(population) %>% summarize(mean=mean(FstSNP)), aes(x=population, y=.65, label=round(mean, 3)))
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Adult Identity-by-state
lbls <- paste("PC", 1:4, "\n", format(pc.percent.adult[1:4], digits=2), "%", sep="")
pairs(pca.adult$eigenvect[,1:4], col=as.color(tab.adult.annot$pCO2.parent), labels=lbls, main="colors = pCO2 only")
pairs(pca.adult$eigenvect[,1:4], col=as.color(tab.adult.annot$pop_code), labels=lbls, main="colors = pop & pCO2")
ibs.adult <- snpgdsIBS(genofile.adult, snp.id=snpset.adult.id, num.thread=2, autosome.only=FALSE)
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 53
## # of SNPs: 456
## using 2 threads
## IBS: the sum of all selected genotypes (0,1,2) = 37200
## Tue Apr 20 20:24:52 2021 (internal increment: 65536)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
## Tue Apr 20 20:24:52 2021 Done.
pop.idx.adult <- order(tab.adult.annot$pop_code)
#par(xpd=TRUE)
#heatmap(ibs.adult$ibs, col=terrain.colors(16), labCol=TRUE,
# RowSideColors=c("black","red","green","blue")[tab.adult.annot$pop_code])
#legend(0,-.05, legend=levels(tab.adult.annot$pop_code), pch="o", col=1:nlevels(tab.adult.annot$pop_code),
# text.col=1:nlevels(tab.adult.annot$pop_code), cex=.75, ncol=4)
#Perform multidimensional scaling analysis on the matrix of genome-wide IBS pairwise distances:
loc.adult <- cmdscale(1 - ibs.adult$ibs, k = 2)
x.adult <- loc.adult[, 1]; y.adult <- loc.adult[, 2]
plot(x.adult, y.adult, col=group.adult, xlab = "", ylab = "",
main = "Multidimensional Scaling Analysis (IBS)", cex=2, pch=16)
text(x.adult, y.adult+.01, labels=tab.adult.annot$sample.id, cex=0.7, font=2, col=as.integer(group.adult))
legend("bottom", legend=levels(group.adult), pch="o", text.col=1:nlevels(group.adult))
/Applications/bioinformatics/gatk-4.1.9.0/gatk VariantFiltration \
-R ../references/Olurida_v081_concat_rehead.fa \
-V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae.vcf.gz \
-O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered.vcf.gz \
--filter-name "FS" \
--filter "FS > 60.0" \
--filter-name "QUAL30" \
--filter "QUAL < 30.0" \
--filter-name "SOR3" \
--filter "SOR > 3.0" \
--filter-name "DP50" \
--filter "DP < 50" \
--filter-name "DP450" \
--filter "DP > 450"
## Using GATK jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar
## Running:
## java -Dsamjdk.use_async_io_read_samtools=false -Dsamjdk.use_async_io_write_samtools=true -Dsamjdk.use_async_io_write_tribble=false -Dsamjdk.compression_level=2 -jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar VariantFiltration -R ../references/Olurida_v081_concat_rehead.fa -V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae.vcf.gz -O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered.vcf.gz --filter-name FS --filter FS > 60.0 --filter-name QUAL30 --filter QUAL < 30.0 --filter-name SOR3 --filter SOR > 3.0 --filter-name DP50 --filter DP < 50 --filter-name DP450 --filter DP > 450
## 20:24:53.986 INFO NativeLibraryLoader - Loading libgkl_compression.dylib from jar:file:/Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar!/com/intel/gkl/native/libgkl_compression.dylib
## Apr 20, 2021 8:24:54 PM shaded.cloud_nio.com.google.auth.oauth2.ComputeEngineCredentials runningOnComputeEngine
## INFO: Failed to detect whether we are running on Google Compute Engine.
## 20:24:54.385 INFO VariantFiltration - ------------------------------------------------------------
## 20:24:54.386 INFO VariantFiltration - The Genome Analysis Toolkit (GATK) v4.1.9.0
## 20:24:54.386 INFO VariantFiltration - For support and documentation go to https://software.broadinstitute.org/gatk/
## 20:24:54.386 INFO VariantFiltration - Executing as laura@lauras-MacBook-Pro-2.local on Mac OS X v10.15.7 x86_64
## 20:24:54.386 INFO VariantFiltration - Java runtime: Java HotSpot(TM) 64-Bit Server VM v11.0.1+13-LTS
## 20:24:54.386 INFO VariantFiltration - Start Date/Time: April 20, 2021 at 8:24:53 PM PDT
## 20:24:54.386 INFO VariantFiltration - ------------------------------------------------------------
## 20:24:54.386 INFO VariantFiltration - ------------------------------------------------------------
## 20:24:54.386 INFO VariantFiltration - HTSJDK Version: 2.23.0
## 20:24:54.386 INFO VariantFiltration - Picard Version: 2.23.3
## 20:24:54.386 INFO VariantFiltration - HTSJDK Defaults.COMPRESSION_LEVEL : 2
## 20:24:54.386 INFO VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_READ_FOR_SAMTOOLS : false
## 20:24:54.387 INFO VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_SAMTOOLS : true
## 20:24:54.387 INFO VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_TRIBBLE : false
## 20:24:54.387 INFO VariantFiltration - Deflater: IntelDeflater
## 20:24:54.387 INFO VariantFiltration - Inflater: IntelInflater
## 20:24:54.387 INFO VariantFiltration - GCS max retries/reopens: 20
## 20:24:54.387 INFO VariantFiltration - Requester pays: disabled
## 20:24:54.387 INFO VariantFiltration - Initializing engine
## 20:24:54.499 INFO FeatureManager - Using codec VCFCodec to read file file:///Volumes/Bumblebee/O.lurida_QuantSeq-2020/notebooks/../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae.vcf.gz
## 20:24:54.577 INFO VariantFiltration - Done initializing engine
## 20:24:54.640 INFO ProgressMeter - Starting traversal
## 20:24:54.640 INFO ProgressMeter - Current Locus Elapsed Minutes Variants Processed Variants/Minute
## 20:25:04.650 INFO ProgressMeter - Contig64832_70524:46356988 0.2 485000 2907092.9
## 20:25:05.885 INFO ProgressMeter - Contig98527_104152:30619157 0.2 555097 2961833.7
## 20:25:05.885 INFO ProgressMeter - Traversal complete. Processed 555097 total variants in 0.2 minutes.
## 20:25:05.947 INFO VariantFiltration - Shutting down engine
## [April 20, 2021 at 8:25:05 PM PDT] org.broadinstitute.hellbender.tools.walkers.filters.VariantFiltration done. Elapsed time: 0.20 minutes.
## Runtime.totalMemory()=387973120
/Applications/bioinformatics/gatk-4.1.9.0/gatk SelectVariants \
-R ../references/Olurida_v081_concat_rehead.fa \
-V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered.vcf.gz \
--exclude-filtered TRUE \
--select-type-to-include SNP \
-O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered-true.vcf.gz
## Using GATK jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar
## Running:
## java -Dsamjdk.use_async_io_read_samtools=false -Dsamjdk.use_async_io_write_samtools=true -Dsamjdk.use_async_io_write_tribble=false -Dsamjdk.compression_level=2 -jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar SelectVariants -R ../references/Olurida_v081_concat_rehead.fa -V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered.vcf.gz --exclude-filtered TRUE --select-type-to-include SNP -O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered-true.vcf.gz
## 20:25:07.808 INFO NativeLibraryLoader - Loading libgkl_compression.dylib from jar:file:/Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar!/com/intel/gkl/native/libgkl_compression.dylib
## Apr 20, 2021 8:25:08 PM shaded.cloud_nio.com.google.auth.oauth2.ComputeEngineCredentials runningOnComputeEngine
## INFO: Failed to detect whether we are running on Google Compute Engine.
## 20:25:08.075 INFO SelectVariants - ------------------------------------------------------------
## 20:25:08.075 INFO SelectVariants - The Genome Analysis Toolkit (GATK) v4.1.9.0
## 20:25:08.075 INFO SelectVariants - For support and documentation go to https://software.broadinstitute.org/gatk/
## 20:25:08.075 INFO SelectVariants - Executing as laura@lauras-MacBook-Pro-2.local on Mac OS X v10.15.7 x86_64
## 20:25:08.075 INFO SelectVariants - Java runtime: Java HotSpot(TM) 64-Bit Server VM v11.0.1+13-LTS
## 20:25:08.075 INFO SelectVariants - Start Date/Time: April 20, 2021 at 8:25:07 PM PDT
## 20:25:08.075 INFO SelectVariants - ------------------------------------------------------------
## 20:25:08.075 INFO SelectVariants - ------------------------------------------------------------
## 20:25:08.076 INFO SelectVariants - HTSJDK Version: 2.23.0
## 20:25:08.076 INFO SelectVariants - Picard Version: 2.23.3
## 20:25:08.076 INFO SelectVariants - HTSJDK Defaults.COMPRESSION_LEVEL : 2
## 20:25:08.076 INFO SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_READ_FOR_SAMTOOLS : false
## 20:25:08.076 INFO SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_SAMTOOLS : true
## 20:25:08.076 INFO SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_TRIBBLE : false
## 20:25:08.076 INFO SelectVariants - Deflater: IntelDeflater
## 20:25:08.076 INFO SelectVariants - Inflater: IntelInflater
## 20:25:08.076 INFO SelectVariants - GCS max retries/reopens: 20
## 20:25:08.076 INFO SelectVariants - Requester pays: disabled
## 20:25:08.077 INFO SelectVariants - Initializing engine
## 20:25:08.217 INFO FeatureManager - Using codec VCFCodec to read file file:///Volumes/Bumblebee/O.lurida_QuantSeq-2020/notebooks/../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered.vcf.gz
## 20:25:08.290 INFO SelectVariants - Done initializing engine
## 20:25:08.324 INFO ProgressMeter - Starting traversal
## 20:25:08.325 INFO ProgressMeter - Current Locus Elapsed Minutes Variants Processed Variants/Minute
## 20:25:18.413 INFO ProgressMeter - Contig11179_21262:51558538 0.2 90000 535342.5
## 20:25:28.423 INFO ProgressMeter - Contig28136_34745:55217958 0.3 220000 656781.8
## 20:25:38.432 INFO ProgressMeter - Contig59084_64831:51489531 0.5 370000 737370.0
## 20:25:43.054 INFO ProgressMeter - Contig98527_104152:27381330 0.6 439968 760116.3
## 20:25:43.054 INFO ProgressMeter - Traversal complete. Processed 439968 total variants in 0.6 minutes.
## 20:25:43.102 INFO SelectVariants - Shutting down engine
## [April 20, 2021 at 8:25:43 PM PDT] org.broadinstitute.hellbender.tools.walkers.variantutils.SelectVariants done. Elapsed time: 0.59 minutes.
## Runtime.totalMemory()=322961408
#snpgdsClose(genofile.larvae)
vcf.fn.larvae <- "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered-true.vcf.gz"
snpgdsVCF2GDS(vcf.fn.larvae, "../qc-processing/gatk/genotypes-larvae.gds", method="biallelic.only")
## Start file conversion from VCF to SNP GDS ...
## Method: exacting biallelic SNPs
## Number of samples: 76
## Parsing "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered-true.vcf.gz" ...
## import 128222 variants.
## + genotype { Bit2 76x128222, 2.3M } *
## Optimize the access efficiency ...
## Clean up the fragments of GDS file:
## open the file '../qc-processing/gatk/genotypes-larvae.gds' (2.9M)
## # of fragments: 59
## save to '../qc-processing/gatk/genotypes-larvae.gds.tmp'
## rename '../qc-processing/gatk/genotypes-larvae.gds.tmp' (2.9M, reduced: 468B)
## # of fragments: 20
snpgdsSummary("../qc-processing/gatk/genotypes-larvae.gds")
## The file name: /Volumes/Bumblebee/O.lurida_QuantSeq-2020/qc-processing/gatk/genotypes-larvae.gds
## The total number of samples: 76
## The total number of SNPs: 128222
## SNP genotypes are stored in SNP-major mode (Sample X SNP).
(genofile.larvae <- snpgdsOpen("../qc-processing/gatk/genotypes-larvae.gds"))
## File: /Volumes/Bumblebee/O.lurida_QuantSeq-2020/qc-processing/gatk/genotypes-larvae.gds (2.9M)
## + [ ] *
## |--+ sample.id { Str8 76 LZMA_ra(63.8%), 201B }
## |--+ snp.id { Int32 128222 LZMA_ra(6.56%), 32.8K }
## |--+ snp.rs.id { Str8 128222 LZMA_ra(0.13%), 173B }
## |--+ snp.position { Int32 128222 LZMA_ra(42.0%), 210.2K }
## |--+ snp.chromosome { Str8 128222 LZMA_ra(0.04%), 857B }
## |--+ snp.allele { Str8 128222 LZMA_ra(12.0%), 60.2K }
## |--+ genotype { Bit2 76x128222, 2.3M } *
## \--+ snp.annot [ ]
## |--+ qual { Float32 128222 LZMA_ra(65.7%), 328.9K }
## \--+ filter { Str8 128222 LZMA_ra(0.04%), 253B }
snpset.larvae <- snpgdsLDpruning(genofile.larvae, maf=.10, missing.rate=0.10, autosome.only=FALSE)
## SNP pruning based on LD:
## Excluding 128,154 SNPs (monomorphic: TRUE, MAF: 0.1, missing rate: 0.1)
## # of samples: 76
## # of SNPs: 68
## using 1 thread
## sliding window: 500,000 basepairs, Inf SNPs
## |LD| threshold: 0.2
## method: composite
## Chromosome Contig0_5313: 0.09%, 14/15,947
## Chromosome Contig104153_109739: 0.06%, 1/1,625
## Chromosome Contig11179_21262: 0.02%, 3/12,305
## Chromosome Contig116188_123130: 0.08%, 1/1,207
## Chromosome Contig129983_136896: 0.08%, 1/1,222
## Chromosome Contig136897_143777: 0.15%, 2/1,321
## Chromosome Contig143780_150710: 0.17%, 2/1,179
## Chromosome Contig181898_197428: 0.09%, 1/1,122
## Chromosome Contig21263_28135: 0.04%, 5/13,677
## Chromosome Contig28136_34745: 0.03%, 4/12,701
## Chromosome Contig34748_41067: 0.06%, 6/10,383
## Chromosome Contig398123_696856: 0.10%, 1/992
## Chromosome Contig41068_47193: 0.09%, 7/7,913
## Chromosome Contig47194_53180: 0.04%, 3/6,968
## Chromosome Contig5314_11178: 0.03%, 1/3,691
## Chromosome Contig53181_59083: 0.05%, 3/5,690
## Chromosome Contig59084_64831: 0.09%, 4/4,432
## Chromosome Contig81766_87352: 0.11%, 3/2,843
## Chromosome Contig87353_92929: 0.05%, 1/1,952
## Chromosome Contig92931_98526: 0.05%, 1/1,900
## Chromosome Contig98527_104152: 0.05%, 1/1,979
## 65 markers are selected in total.
snpset.larvae.id <- unlist(unname(snpset.larvae))
# PCA
pca.larvae <- snpgdsPCA(genofile.larvae, snp.id=snpset.larvae.id, num.thread=2, autosome.only=FALSE)
## Principal Component Analysis (PCA) on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 76
## # of SNPs: 65
## using 2 threads
## # of principal components: 32
## PCA: the sum of all selected genotypes (0,1,2) = 7538
## CPU capabilities: Double-Precision SSE2
## Tue Apr 20 20:25:49 2021 (internal increment: 13312)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
## Tue Apr 20 20:25:49 2021 Begin (eigenvalues and eigenvectors)
## Tue Apr 20 20:25:49 2021 Done.
pc.percent.larvae <- pca.larvae$varprop*100
head(round(pc.percent.larvae, 2))
## [1] 10.00 7.72 5.61 5.10 4.57 4.19
tab.larvae <- data.frame(sample.id = pca.larvae$sample.id,
EV1 = pca.larvae$eigenvect[,1], # the first eigenvector
EV2 = pca.larvae$eigenvect[,2], # the second eigenvector
EV3 = pca.larvae$eigenvect[,3], # the third eigenvector
EV4 = pca.larvae$eigenvect[,4], # the fourth eigenvector
EV5 = pca.larvae$eigenvect[,5], # the fourth eigenvector
stringsAsFactors = FALSE)
tab.larvae.annot <- left_join(tab.larvae, sample.info.larvae, by=c("sample.id"="sample")) %>% droplevels()
group.larvae <- as.factor(tab.larvae.annot$pop_code)
lbls <- paste("PC", 1:5, "\n", format(pc.percent.larvae[1:5], digits=2), "%", sep="")
pairs(pca.larvae$eigenvect[,1:5], col=as.color(tab.larvae.annot$pCO2.parent), labels=lbls, main="colors = pCO2 only")
pairs(pca.larvae$eigenvect[,1:5], col=as.color(tab.larvae.annot$pop_code), labels=lbls, main="colors = pop & pCO2")
Interactive PCA biplots
# PC1 x PC2
# uncomment out the ggplotly lines to interact
#ggplotly(
ggplot(tab.larvae.annot, aes(x=EV2, y=EV1, col=population, shape=pCO2.parent)) +
geom_point(size=4) + theme_minimal() + ggtitle("PC1 x PC2\nLarvae Genetic Structure (pooled batches)")#)
#PC1 x PC3
# uncomment out the ggplotly lines to interact
ggplotly(
ggplot(tab.larvae.annot, aes(x=EV3, y=EV1, col=population, shape=pCO2.parent)) +
geom_point(size=4) + theme_minimal() + ggtitle("PC1 x PC3\nLarvae Genetic Structure (pooled batches)"))
# PC1 x PC4
ggplotly(
ggplot(tab.larvae.annot, aes(x=EV4, y=EV2, col=population, shape=pCO2.parent)) +
geom_point(size=4) + theme_minimal() + ggtitle("PC1 x PC4\nLarvae Gene Expression (pooled batches)"))
# PC1 x PC5
ggplotly(
ggplot(tab.larvae.annot, aes(x=EV5, y=EV1, col=population, shape=pCO2.parent)) +
geom_point(size=4) + theme_minimal() + ggtitle("PC1 x PC5\nLarvae Gene Expression (pooled batches)"))
# To perform cluster analysis on the matrix of genome-wide IBS pairwise distances,
# and determine the groups by a permutation score.
ibs.hc.larvae <- snpgdsHCluster(snpgdsIBS(genofile.larvae, num.thread=2, snp.id=snpset.larvae.id, autosome.only=FALSE,
sample.id=setdiff(tab.larvae.annot$sample.id, "044")))
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 75
## # of SNPs: 65
## using 2 threads
## IBS: the sum of all selected genotypes (0,1,2) = 7516
## Tue Apr 20 20:25:50 2021 (internal increment: 65536)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
## Tue Apr 20 20:25:50 2021 Done.
rv.larvae <- snpgdsCutTree(ibs.hc.larvae)
## Determine groups by permutation (Z threshold: 15, outlier threshold: 5):
## Create 1 groups.
plot(rv.larvae$dendrogram, leaflab="none", main="HapMap Phase II")
# Determine groups of individuals by population information
group.larvae <- as.factor(tab.larvae.annot$pop_code)
rv2.larvae <- snpgdsCutTree(ibs.hc.larvae, samp.group=as.factor(subset(tab.larvae.annot, sample.id!="044")$pop_code))
## Create 8 groups.
plot(rv2.larvae$dendrogram, leaflab="none", main="HapMap Phase II")
legend("top", legend=levels(group.larvae), col=1:nlevels(group.larvae), pch=19, ncol=4, cex=0.6, pt.cex=1)
## Larval Allele frequencies
AF.larvae.FB.high <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id,
sample.id=subset(tab.larvae.annot, population=="Fidalgo Bay" & pCO2.parent=="High")$sample.id,
with.id = TRUE)
AF.larvae.FB.amb <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id,
sample.id=subset(tab.larvae.annot, population=="Fidalgo Bay" & pCO2.parent=="Ambient")$sample.id,
with.id = TRUE)
AF.larvae.DB.high <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id,
sample.id=subset(tab.larvae.annot, population=="Dabob Bay" & pCO2.parent=="High")$sample.id,
with.id = TRUE)
AF.larvae.DB.amb <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id,
sample.id=subset(tab.larvae.annot, population=="Dabob Bay" & pCO2.parent=="Ambient")$sample.id,
with.id = TRUE)
AF.larvae.OB1.high <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id,
sample.id=subset(tab.larvae.annot, population=="Oyster Bay C1" & pCO2.parent=="High")$sample.id,
with.id = TRUE)
AF.larvae.OB1.amb <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id,
sample.id=subset(tab.larvae.annot, population=="Oyster Bay C1" & pCO2.parent=="Ambient")$sample.id,
with.id = TRUE)
AF.larvae.OB2.high <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id,
sample.id=subset(tab.larvae.annot, population=="Oyster Bay C2" & pCO2.parent=="High")$sample.id,
with.id = TRUE)
AF.larvae.OB2.amb <- snpgdsSNPRateFreq(genofile.larvae, snp.id=snpset.larvae.id,
sample.id=subset(tab.larvae.annot, population=="Oyster Bay C2" & pCO2.parent=="Ambient")$sample.id,
with.id = TRUE)
AF.larvae <- rbind(do.call(cbind.data.frame, AF.larvae.FB.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("FB"), stage=as.factor("larvae"), pCO2=as.factor("high")),
do.call(cbind.data.frame, AF.larvae.FB.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("FB"), stage=as.factor("larvae"), pCO2=as.factor("ambient")),
do.call(cbind.data.frame, AF.larvae.DB.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("DB"), stage=as.factor("larvae"), pCO2=as.factor("high")),
do.call(cbind.data.frame, AF.larvae.DB.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("DB"), stage=as.factor("larvae"), pCO2=as.factor("ambient")),
do.call(cbind.data.frame, AF.larvae.OB1.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("OB1"), stage=as.factor("larvae"), pCO2=as.factor("high")),
do.call(cbind.data.frame, AF.larvae.OB1.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("OB1"), stage=as.factor("larvae"), pCO2=as.factor("ambient")),
do.call(cbind.data.frame, AF.larvae.OB2.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("OB2"), stage=as.factor("larvae"), pCO2=as.factor("high")),
do.call(cbind.data.frame, AF.larvae.OB2.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("OB2"), stage=as.factor("larvae"), pCO2=as.factor("ambient")))
ggplot(AF.larvae, aes(x=pCO2, y=AlleleFreq, fill=population, alpha=pCO2)) + geom_violin(trim = F) + facet_wrap(~population, ncol=4) + theme_minimal() +scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=8, size=4, color="black", fill="black") + xlab("pCO2 exposure") + ylab("Allele Frequency") +
ggtitle(paste("SNP Allele Frequency, larvaes by population & pCO2 treatment\n", "(No. loci = ", length(snpset.larvae.id), ")", sep="")) +
geom_text(data=AF.larvae %>% group_by(population, pCO2) %>% summarize(mean=mean(AlleleFreq)), aes(x=pCO2, y=mean+.1, label=round(mean, 2))) + theme(legend.position = "none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
# within FB (comparing pCO2 exposure):
Fst.larvae.FB <- snpgdsFst(genofile.larvae, snp.id=snpset.larvae.id,
sample.id=subset(tab.larvae.annot, population=="Fidalgo Bay")$sample.id,
population=as.factor(subset(tab.larvae.annot, population=="Fidalgo Bay")$pCO2.parent),
method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 7 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 17
## # of SNPs: 58
## Method: Weir & Cockerham, 1984
## # of Populations: 2
## Ambient (7), High (10)
paste("Weir and Cockerham mean Fst estimate, Fidalgo Bay Ambient vs. High: ", round(Fst.larvae.FB$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Fidalgo Bay Ambient vs. High: 0.00852"
hist(Fst.larvae.FB$FstSNP, breaks = 75, main="Fst distribution\nFidalgo Bay Ambient vs. High, Larvae", col="dodgerblue3", xlim = c(-.15, 0.8))
# within DB (comparing pCO2 exposure):
Fst.larvae.DB <- snpgdsFst(genofile.larvae, snp.id=snpset.larvae.id,
sample.id=subset(tab.larvae.annot, population=="Dabob Bay")$sample.id,
population=as.factor(subset(tab.larvae.annot, population=="Dabob Bay")$pCO2.parent),
method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 18 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 12
## # of SNPs: 47
## Method: Weir & Cockerham, 1984
## # of Populations: 2
## Ambient (5), High (7)
paste("Weir and Cockerham mean Fst estimate, Dabob Bay Ambient vs. High: ", round(Fst.larvae.DB$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Dabob Bay Ambient vs. High: 0.01232"
hist(Fst.larvae.DB$FstSNP, breaks = 100, main="Fst distribution\nDabob Bay Ambient vs. High, Larvae", col="darkseagreen3", xlim = c(-.2, .8))
# within OB1 (comparing pCO2 exposure):
Fst.larvae.OB1 <- snpgdsFst(genofile.larvae, snp.id=snpset.larvae.id,
sample.id=subset(tab.larvae.annot, population=="Oyster Bay C1")$sample.id,
population=as.factor(subset(tab.larvae.annot, population=="Oyster Bay C1")$pCO2.parent),
method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 2 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 32
## # of SNPs: 63
## Method: Weir & Cockerham, 1984
## # of Populations: 2
## Ambient (16), High (16)
paste("Weir and Cockerham mean Fst estimate, Oyster Bay C1 Ambient vs. High: ", round(Fst.larvae.OB1$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Oyster Bay C1 Ambient vs. High: 0.00041"
hist(Fst.larvae.OB1$FstSNP, breaks = 100, main="Fst distribution\nOyster Bay C1 Ambient vs. High, Larvae", col="salmon2", xlim = c(-.2, 0.8))
Fst.larvae <- rbind(do.call(cbind.data.frame, Fst.larvae.FB[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("FB"), stage=as.factor("larvae")),
do.call(cbind.data.frame, Fst.larvae.DB[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("DB"), stage=as.factor("larvae")),
do.call(cbind.data.frame, Fst.larvae.OB1[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("OB1"), stage=as.factor("larvae")))
ggplot(Fst.larvae, aes(x=population, y=FstSNP, fill=population)) + geom_violin(trim = F) +
stat_summary(fun.y=mean, geom="point", shape=8, size=3.5, color="black", fill="black") +
ggtitle(paste("SNP Fst, Larvae by pCO2 treatment\n", "(No. loci = ", length(snpset.larvae.id), ")", sep="")) +
geom_text(data=Fst.larvae %>% group_by(population) %>% summarize(mean=mean(FstSNP)), aes(x=population, y=.78, label=round(mean, 3))) +
theme_minimal()
## Warning: `fun.y` is deprecated. Use `fun` instead.
Larval Identify-by-state
ibs.larvae <- snpgdsIBS(genofile.larvae, num.thread=2, snp.id=snpset.larvae.id, autosome.only=FALSE, sample.id=setdiff(tab.larvae.annot$sample.id, "044"))
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 75
## # of SNPs: 65
## using 2 threads
## IBS: the sum of all selected genotypes (0,1,2) = 7516
## Tue Apr 20 20:25:51 2021 (internal increment: 65536)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
## Tue Apr 20 20:25:51 2021 Done.
pop.idx.larvae <- order(tab.larvae.annot$pop_code)
#perform multidimensional scaling analysis on the matrix of genome-wide IBS pairwise distances:
loc.larvae <- cmdscale(1 - ibs.larvae$ibs, k = 2)
x.larvae <- loc.larvae[, 1]; y.larvae <- loc.larvae[, 2]
a <- as.factor(subset(tab.larvae.annot, sample.id!="044")$population)
b <- as.numeric(as.factor(subset(tab.larvae.annot, sample.id!="044")$pCO2.parent))
par(xpd=TRUE)
plot(x.larvae, y.larvae, col=group.larvae, xlab = "", ylab = "",
main = "Multidimensional Scaling Analysis (IBS)", cex=2, pch=16)
#text(x.larvae, y.larvae+.01, labels=tab.larvae.annot$sample.id, cex=0.7, font=2, col=as.integer(group.larvae))
legend("bottomright", inset=c(-.065,0), legend=levels(group.larvae), cex=.6, pch="o", text.col=1:nlevels(group.larvae))
# Option to plot color coded by pop, symbol by pCO2
#plot(x.larvae, y.larvae, col=a, pch=b, xlab = "", ylab = "",
# main = "Multidimensional Scaling Analysis (IBS)", cex=1)
/Applications/bioinformatics/gatk-4.1.9.0/gatk VariantFiltration \
-R ../references/Olurida_v081_concat_rehead.fa \
-V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile.vcf.gz \
-O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered.vcf.gz \
--filter-name "FS" \
--filter "FS > 60.0" \
--filter-name "QUAL30" \
--filter "QUAL < 30.0" \
--filter-name "SOR3" \
--filter "SOR > 3.0" \
--filter-name "DP50" \
--filter "DP < 50" \
--filter-name "DP450" \
--filter "DP > 450"
## Using GATK jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar
## Running:
## java -Dsamjdk.use_async_io_read_samtools=false -Dsamjdk.use_async_io_write_samtools=true -Dsamjdk.use_async_io_write_tribble=false -Dsamjdk.compression_level=2 -jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar VariantFiltration -R ../references/Olurida_v081_concat_rehead.fa -V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile.vcf.gz -O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered.vcf.gz --filter-name FS --filter FS > 60.0 --filter-name QUAL30 --filter QUAL < 30.0 --filter-name SOR3 --filter SOR > 3.0 --filter-name DP50 --filter DP < 50 --filter-name DP450 --filter DP > 450
## 20:25:53.658 INFO NativeLibraryLoader - Loading libgkl_compression.dylib from jar:file:/Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar!/com/intel/gkl/native/libgkl_compression.dylib
## Apr 20, 2021 8:25:53 PM shaded.cloud_nio.com.google.auth.oauth2.ComputeEngineCredentials runningOnComputeEngine
## INFO: Failed to detect whether we are running on Google Compute Engine.
## 20:25:53.829 INFO VariantFiltration - ------------------------------------------------------------
## 20:25:53.829 INFO VariantFiltration - The Genome Analysis Toolkit (GATK) v4.1.9.0
## 20:25:53.829 INFO VariantFiltration - For support and documentation go to https://software.broadinstitute.org/gatk/
## 20:25:53.829 INFO VariantFiltration - Executing as laura@lauras-MacBook-Pro-2.local on Mac OS X v10.15.7 x86_64
## 20:25:53.829 INFO VariantFiltration - Java runtime: Java HotSpot(TM) 64-Bit Server VM v11.0.1+13-LTS
## 20:25:53.829 INFO VariantFiltration - Start Date/Time: April 20, 2021 at 8:25:53 PM PDT
## 20:25:53.830 INFO VariantFiltration - ------------------------------------------------------------
## 20:25:53.830 INFO VariantFiltration - ------------------------------------------------------------
## 20:25:53.830 INFO VariantFiltration - HTSJDK Version: 2.23.0
## 20:25:53.830 INFO VariantFiltration - Picard Version: 2.23.3
## 20:25:53.830 INFO VariantFiltration - HTSJDK Defaults.COMPRESSION_LEVEL : 2
## 20:25:53.830 INFO VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_READ_FOR_SAMTOOLS : false
## 20:25:53.830 INFO VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_SAMTOOLS : true
## 20:25:53.830 INFO VariantFiltration - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_TRIBBLE : false
## 20:25:53.830 INFO VariantFiltration - Deflater: IntelDeflater
## 20:25:53.830 INFO VariantFiltration - Inflater: IntelInflater
## 20:25:53.830 INFO VariantFiltration - GCS max retries/reopens: 20
## 20:25:53.831 INFO VariantFiltration - Requester pays: disabled
## 20:25:53.831 INFO VariantFiltration - Initializing engine
## 20:25:53.930 INFO FeatureManager - Using codec VCFCodec to read file file:///Volumes/Bumblebee/O.lurida_QuantSeq-2020/notebooks/../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile.vcf.gz
## 20:25:53.998 INFO VariantFiltration - Done initializing engine
## 20:25:54.054 INFO ProgressMeter - Starting traversal
## 20:25:54.055 INFO ProgressMeter - Current Locus Elapsed Minutes Variants Processed Variants/Minute
## 20:25:59.122 INFO ProgressMeter - Contig98527_104152:26467437 0.1 280822 3325961.3
## 20:25:59.122 INFO ProgressMeter - Traversal complete. Processed 280822 total variants in 0.1 minutes.
## 20:25:59.185 INFO VariantFiltration - Shutting down engine
## [April 20, 2021 at 8:25:59 PM PDT] org.broadinstitute.hellbender.tools.walkers.filters.VariantFiltration done. Elapsed time: 0.09 minutes.
## Runtime.totalMemory()=322961408
/Applications/bioinformatics/gatk-4.1.9.0/gatk SelectVariants \
-R ../references/Olurida_v081_concat_rehead.fa \
-V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered.vcf.gz \
--exclude-filtered TRUE \
--select-type-to-include SNP \
-O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered-true.vcf.gz
## Using GATK jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar
## Running:
## java -Dsamjdk.use_async_io_read_samtools=false -Dsamjdk.use_async_io_write_samtools=true -Dsamjdk.use_async_io_write_tribble=false -Dsamjdk.compression_level=2 -jar /Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar SelectVariants -R ../references/Olurida_v081_concat_rehead.fa -V ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered.vcf.gz --exclude-filtered TRUE --select-type-to-include SNP -O ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered-true.vcf.gz
## 20:26:01.519 INFO NativeLibraryLoader - Loading libgkl_compression.dylib from jar:file:/Applications/bioinformatics/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar!/com/intel/gkl/native/libgkl_compression.dylib
## Apr 20, 2021 8:26:01 PM shaded.cloud_nio.com.google.auth.oauth2.ComputeEngineCredentials runningOnComputeEngine
## INFO: Failed to detect whether we are running on Google Compute Engine.
## 20:26:01.770 INFO SelectVariants - ------------------------------------------------------------
## 20:26:01.770 INFO SelectVariants - The Genome Analysis Toolkit (GATK) v4.1.9.0
## 20:26:01.770 INFO SelectVariants - For support and documentation go to https://software.broadinstitute.org/gatk/
## 20:26:01.770 INFO SelectVariants - Executing as laura@lauras-MacBook-Pro-2.local on Mac OS X v10.15.7 x86_64
## 20:26:01.770 INFO SelectVariants - Java runtime: Java HotSpot(TM) 64-Bit Server VM v11.0.1+13-LTS
## 20:26:01.770 INFO SelectVariants - Start Date/Time: April 20, 2021 at 8:26:01 PM PDT
## 20:26:01.770 INFO SelectVariants - ------------------------------------------------------------
## 20:26:01.770 INFO SelectVariants - ------------------------------------------------------------
## 20:26:01.771 INFO SelectVariants - HTSJDK Version: 2.23.0
## 20:26:01.771 INFO SelectVariants - Picard Version: 2.23.3
## 20:26:01.771 INFO SelectVariants - HTSJDK Defaults.COMPRESSION_LEVEL : 2
## 20:26:01.771 INFO SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_READ_FOR_SAMTOOLS : false
## 20:26:01.771 INFO SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_SAMTOOLS : true
## 20:26:01.771 INFO SelectVariants - HTSJDK Defaults.USE_ASYNC_IO_WRITE_FOR_TRIBBLE : false
## 20:26:01.772 INFO SelectVariants - Deflater: IntelDeflater
## 20:26:01.772 INFO SelectVariants - Inflater: IntelInflater
## 20:26:01.772 INFO SelectVariants - GCS max retries/reopens: 20
## 20:26:01.772 INFO SelectVariants - Requester pays: disabled
## 20:26:01.772 INFO SelectVariants - Initializing engine
## 20:26:01.918 INFO FeatureManager - Using codec VCFCodec to read file file:///Volumes/Bumblebee/O.lurida_QuantSeq-2020/notebooks/../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered.vcf.gz
## 20:26:01.992 INFO SelectVariants - Done initializing engine
## 20:26:02.032 INFO ProgressMeter - Starting traversal
## 20:26:02.032 INFO ProgressMeter - Current Locus Elapsed Minutes Variants Processed Variants/Minute
## 20:26:09.533 INFO ProgressMeter - Contig98527_104152:24952552 0.1 222831 1782648.0
## 20:26:09.533 INFO ProgressMeter - Traversal complete. Processed 222831 total variants in 0.1 minutes.
## 20:26:09.580 INFO SelectVariants - Shutting down engine
## [April 20, 2021 at 8:26:09 PM PDT] org.broadinstitute.hellbender.tools.walkers.variantutils.SelectVariants done. Elapsed time: 0.14 minutes.
## Runtime.totalMemory()=322961408
#snpgdsClose(genofile.juvenile)
vcf.fn.juvenile <- "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered-true.vcf.gz"
snpgdsVCF2GDS(vcf.fn.juvenile, "../qc-processing/gatk/genotypes-juvenile.gds", method="biallelic.only")
## Start file conversion from VCF to SNP GDS ...
## Method: exacting biallelic SNPs
## Number of samples: 16
## Parsing "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered-true.vcf.gz" ...
## import 50608 variants.
## + genotype { Bit2 16x50608, 197.7K } *
## Optimize the access efficiency ...
## Clean up the fragments of GDS file:
## open the file '../qc-processing/gatk/genotypes-juvenile.gds' (459.9K)
## # of fragments: 49
## save to '../qc-processing/gatk/genotypes-juvenile.gds.tmp'
## rename '../qc-processing/gatk/genotypes-juvenile.gds.tmp' (459.5K, reduced: 348B)
## # of fragments: 20
snpgdsSummary("../qc-processing/gatk/genotypes-juvenile.gds")
## The file name: /Volumes/Bumblebee/O.lurida_QuantSeq-2020/qc-processing/gatk/genotypes-juvenile.gds
## The total number of samples: 16
## The total number of SNPs: 50608
## SNP genotypes are stored in SNP-major mode (Sample X SNP).
(genofile.juvenile <- snpgdsOpen("../qc-processing/gatk/genotypes-juvenile.gds"))
## File: /Volumes/Bumblebee/O.lurida_QuantSeq-2020/qc-processing/gatk/genotypes-juvenile.gds (459.5K)
## + [ ] *
## |--+ sample.id { Str8 16 LZMA_ra(178.1%), 121B }
## |--+ snp.id { Int32 50608 LZMA_ra(7.08%), 14.0K }
## |--+ snp.rs.id { Str8 50608 LZMA_ra(0.30%), 157B }
## |--+ snp.position { Int32 50608 LZMA_ra(43.1%), 85.1K }
## |--+ snp.chromosome { Str8 50608 LZMA_ra(0.07%), 649B }
## |--+ snp.allele { Str8 50608 LZMA_ra(12.3%), 24.2K }
## |--+ genotype { Bit2 16x50608, 197.7K } *
## \--+ snp.annot [ ]
## |--+ qual { Float32 50608 LZMA_ra(68.5%), 135.4K }
## \--+ filter { Str8 50608 LZMA_ra(0.08%), 197B }
snpset.juvenile <- snpgdsLDpruning(genofile.juvenile, maf=.1, missing.rate=0, autosome.only=FALSE)
## SNP pruning based on LD:
## Excluding 41,649 SNPs (monomorphic: TRUE, MAF: 0.1, missing rate: 0)
## # of samples: 16
## # of SNPs: 8,959
## using 1 thread
## sliding window: 500,000 basepairs, Inf SNPs
## |LD| threshold: 0.2
## method: composite
## Chromosome Contig0_5313: 4.90%, 281/5,731
## Chromosome Contig104153_109739: 5.33%, 40/750
## Chromosome Contig109741_116186: 5.22%, 35/670
## Chromosome Contig11179_21262: 4.22%, 190/4,498
## Chromosome Contig116188_123130: 6.21%, 36/580
## Chromosome Contig123131_129982: 5.21%, 22/422
## Chromosome Contig129983_136896: 6.16%, 33/536
## Chromosome Contig136897_143777: 5.40%, 36/667
## Chromosome Contig143780_150710: 6.58%, 32/486
## Chromosome Contig150711_166372: 6.15%, 28/455
## Chromosome Contig166374_181896: 6.36%, 35/550
## Chromosome Contig181898_197428: 4.96%, 21/423
## Chromosome Contig197431_213218: 4.96%, 28/564
## Chromosome Contig21263_28135: 4.55%, 227/4,994
## Chromosome Contig213221_398012: 6.67%, 34/510
## Chromosome Contig28136_34745: 4.36%, 208/4,772
## Chromosome Contig34748_41067: 5.00%, 191/3,817
## Chromosome Contig398123_696856: 5.88%, 28/476
## Chromosome Contig41068_47193: 4.97%, 159/3,201
## Chromosome Contig47194_53180: 5.18%, 146/2,819
## Chromosome Contig5314_11178: 4.95%, 70/1,414
## Chromosome Contig53181_59083: 5.17%, 131/2,534
## Chromosome Contig59084_64831: 5.86%, 105/1,792
## Chromosome Contig64832_70524: 5.38%, 89/1,654
## Chromosome Contig70525_76135: 5.71%, 76/1,331
## Chromosome Contig76136_81765: 5.87%, 78/1,329
## Chromosome Contig81766_87352: 5.99%, 72/1,203
## Chromosome Contig87353_92929: 6.24%, 54/866
## Chromosome Contig92931_98526: 5.94%, 47/791
## Chromosome Contig98527_104152: 6.99%, 54/773
## 2,586 markers are selected in total.
snpset.juvenile.id <- unlist(unname(snpset.juvenile))
# PCA
pca.juvenile <- snpgdsPCA(genofile.juvenile, snp.id=snpset.juvenile.id, num.thread=2, autosome.only=FALSE)
## Principal Component Analysis (PCA) on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 16
## # of SNPs: 2,586
## using 2 threads
## # of principal components: 32
## PCA: the sum of all selected genotypes (0,1,2) = 57125
## CPU capabilities: Double-Precision SSE2
## Tue Apr 20 20:26:11 2021 (internal increment: 63232)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
## Tue Apr 20 20:26:11 2021 Begin (eigenvalues and eigenvectors)
## Tue Apr 20 20:26:11 2021 Done.
pc.percent.juvenile <- pca.juvenile$varprop*100
head(round(pc.percent.juvenile, 2))
## [1] 19.25 12.89 8.97 7.25 7.08 6.33
tab.juvenile <- data.frame(sample.id = pca.juvenile$sample.id,
EV1 = pca.juvenile$eigenvect[,1], # the first eigenvector
EV2 = pca.juvenile$eigenvect[,2], # the second eigenvector
EV3 = pca.juvenile$eigenvect[,3], # the third eigenvector
EV4 = pca.juvenile$eigenvect[,4], # the fourth eigenvector
EV5 = pca.juvenile$eigenvect[,5], # the fourth eigenvector
stringsAsFactors = FALSE)
tab.juvenile.annot <- left_join(tab.juvenile, sample.info.juvenile, by=c("sample.id"="sample")) %>% droplevels()
group.juvenile <- as.factor(tab.juvenile.annot$pop_code)
lbls <- paste("PC", 1:5, "\n", format(pc.percent.juvenile[1:5], digits=2), "%", sep="")
pairs(pca.juvenile$eigenvect[,1:5], col=as.color(tab.juvenile.annot$pCO2.parent), labels=lbls, main="colors = pCO2 only")
pairs(pca.juvenile$eigenvect[,1:5], col=as.color(tab.juvenile.annot$pop_code), labels=lbls, main="colors = pop & pCO2")
Interactive PCAs
# PC1 x PC2
# uncomment out the ggplotly lines to interact
ggplotly(
ggplot(tab.juvenile.annot, aes(x=EV2, y=EV1, col=population, shape=pCO2.parent)) +
geom_point(size=4) + theme_minimal() + ggtitle("PC1 x PC2\njuvenile Genetic Structure (pooled batches)"))
#PC1 x PC3
# uncomment out the ggplotly lines to interact
ggplotly(
ggplot(tab.juvenile.annot, aes(x=EV3, y=EV1, col=population, shape=pCO2.parent)) +
geom_point(size=4) + theme_minimal() + ggtitle("PC1 x PC3\njuvenile Genetic Structure (pooled batches)"))
#PC2 x PC3
# uncomment out the ggplotly lines to interact
ggplotly(
ggplot(tab.juvenile.annot, aes(x=EV3, y=EV2, col=population, shape=pCO2.parent)) +
geom_point(size=4) + theme_minimal() + ggtitle("PC2 x PC3\njuvenile Genetic Structure (pooled batches)"))
# To perform cluster analysis on the matrix of genome-wide IBS pairwise distances,
# and determine the groups by a permutation score.
ibs.hc.juvenile <- snpgdsHCluster(snpgdsIBS(genofile.juvenile, num.thread=2, snp.id=snpset.juvenile.id, autosome.only=FALSE,
sample.id=setdiff(tab.juvenile.annot$sample.id, "044")))
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 16
## # of SNPs: 2,586
## using 2 threads
## IBS: the sum of all selected genotypes (0,1,2) = 57125
## Tue Apr 20 20:26:11 2021 (internal increment: 65536)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
## Tue Apr 20 20:26:11 2021 Done.
rv.juvenile <- snpgdsCutTree(ibs.hc.juvenile)
## Determine groups by permutation (Z threshold: 15, outlier threshold: 5):
## Create 1 groups.
plot(rv.juvenile$dendrogram, leaflab="none", main="HapMap Phase II")
# Determine groups of individuals by population information
rv2.juvenile <- snpgdsCutTree(ibs.hc.juvenile, samp.group=as.factor(subset(tab.juvenile.annot, sample.id!="044")$pop_code))
## Create 4 groups.
plot(rv2.juvenile$dendrogram, leaflab="none", main="HapMap Phase II")
legend("top", legend=levels(group.juvenile), col=1:nlevels(group.juvenile), pch=19, ncol=4, cex=0.6, pt.cex=1)
## Juvenile Allele frequencies
AF.juvenile.FB.high <- snpgdsSNPRateFreq(genofile.juvenile, snp.id=snpset.juvenile.id,
sample.id=subset(tab.juvenile.annot, population=="Fidalgo Bay" & pCO2.parent=="High")$sample.id,
with.id = TRUE)
AF.juvenile.FB.amb <- snpgdsSNPRateFreq(genofile.juvenile, snp.id=snpset.juvenile.id,
sample.id=subset(tab.juvenile.annot, population=="Fidalgo Bay" & pCO2.parent=="Ambient")$sample.id,
with.id = TRUE)
AF.juvenile.DB.high <- snpgdsSNPRateFreq(genofile.juvenile, snp.id=snpset.juvenile.id,
sample.id=subset(tab.juvenile.annot, population=="Dabob Bay" & pCO2.parent=="High")$sample.id,
with.id = TRUE)
AF.juvenile.DB.amb <- snpgdsSNPRateFreq(genofile.juvenile, snp.id=snpset.juvenile.id,
sample.id=subset(tab.juvenile.annot, population=="Dabob Bay" & pCO2.parent=="Ambient")$sample.id,
with.id = TRUE)
AF.juveniles <- rbind(do.call(cbind.data.frame, AF.juvenile.FB.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("FB"), stage=as.factor("juvenile"), pCO2=as.factor("high")),
do.call(cbind.data.frame, AF.juvenile.FB.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("FB"), stage=as.factor("juvenile"), pCO2=as.factor("ambient")),
do.call(cbind.data.frame, AF.juvenile.DB.high[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("DB"), stage=as.factor("juvenile"), pCO2=as.factor("high")),
do.call(cbind.data.frame, AF.juvenile.DB.amb[c("snp.id", "AlleleFreq")]) %>% mutate(population=as.factor("DB"), stage=as.factor("juvenile"), pCO2=as.factor("ambient")))
ggplot(AF.juveniles, aes(x=pCO2, y=AlleleFreq, fill=population, alpha=pCO2)) + geom_violin(trim = F) + facet_wrap(~population) + theme_minimal() +scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=8, size=4, color="black", fill="black") + xlab("pCO2 exposure") + ylab("Allele Frequency") +
ggtitle(paste("SNP Allele Frequency, juveniles by population & pCO2 treatment\n", "(No. loci = ", length(snpset.juvenile.id), ")", sep="")) +
geom_text(data=AF.juveniles %>% group_by(population, pCO2) %>% summarize(mean=mean(AlleleFreq)), aes(x=pCO2, y=mean+.1, label=round(mean, 2)))
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
# within FB (comparing pCO2 exposure):
Fst.juvenile.FB <- snpgdsFst(genofile.juvenile, snp.id=snpset.juvenile.id,
sample.id=subset(tab.juvenile.annot, population=="Fidalgo Bay")$sample.id,
population=as.factor(subset(tab.juvenile.annot, population=="Fidalgo Bay")$pCO2.parent),
method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 251 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 8
## # of SNPs: 2,335
## Method: Weir & Cockerham, 1984
## # of Populations: 2
## Ambient (4), High (4)
paste("Weir and Cockerham mean Fst estimate, Fidalgo Bay Ambient vs. High: ", round(Fst.juvenile.FB$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Fidalgo Bay Ambient vs. High: 0.03405"
hist(Fst.juvenile.FB$FstSNP, breaks = 75, main="Fst distribution\nFidalgo Bay Ambient vs. High, juvenile", col="dodgerblue3", xlim = c(-.15, 0.8))
# within DB (comparing pCO2 exposure):
Fst.juvenile.DB <- snpgdsFst(genofile.juvenile, snp.id=snpset.juvenile.id,
sample.id=subset(tab.juvenile.annot, population=="Dabob Bay")$sample.id,
population=as.factor(subset(tab.juvenile.annot, population=="Dabob Bay")$pCO2.parent),
method="W&C84", autosome.only=FALSE, with.id = TRUE)
## Fst estimation on genotypes:
## Excluding 276 SNPs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 8
## # of SNPs: 2,310
## Method: Weir & Cockerham, 1984
## # of Populations: 2
## Ambient (4), High (4)
paste("Weir and Cockerham mean Fst estimate, Dabob Bay Ambient vs. High: ", round(Fst.juvenile.DB$MeanFst, 5))
## [1] "Weir and Cockerham mean Fst estimate, Dabob Bay Ambient vs. High: 0.1068"
hist(Fst.juvenile.DB$FstSNP, breaks = 100, main="Fst distribution\nDabob Bay Ambient vs. High, juvenile", col="darkseagreen3", xlim = c(-.2, .8))
Fst.juvenile <- rbind(do.call(cbind.data.frame, Fst.juvenile.FB[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("FB"), stage=as.factor("juvenile")),
do.call(cbind.data.frame, Fst.juvenile.DB[c("snp.id", "FstSNP")]) %>% mutate(population=as.factor("DB"), stage=as.factor("juvenile")))
ggplot(Fst.juvenile, aes(x=population, y=FstSNP, fill=population)) + geom_violin(trim = F) +
stat_summary(fun.y=mean, geom="point", shape=8, size=3.5, color="black", fill="black") +
ggtitle(paste("SNP Fst, juvenile by pCO2 treatment\n", "(No. loci = ", length(snpset.juvenile.id), ")", sep="")) +
geom_text(data=Fst.juvenile %>% group_by(population) %>% summarize(mean=mean(FstSNP)), aes(x=population, y=1.1, label=round(mean, 3))) +
theme_minimal()
## Warning: `fun.y` is deprecated. Use `fun` instead.
ibs.juvenile <- snpgdsIBS(genofile.juvenile, num.thread=2, snp.id=snpset.juvenile.id, autosome.only=FALSE, sample.id=setdiff(tab.juvenile.annot$sample.id, "044"))
## Identity-By-State (IBS) analysis on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 16
## # of SNPs: 2,586
## using 2 threads
## IBS: the sum of all selected genotypes (0,1,2) = 57125
## Tue Apr 20 20:26:12 2021 (internal increment: 65536)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 0s
## Tue Apr 20 20:26:12 2021 Done.
pop.idx.juvenile <- order(tab.juvenile.annot$pop_code)
#perform multidimensional scaling analysis on the matrix of genome-wide IBS pairwise distances:
loc.juvenile <- cmdscale(1 - ibs.juvenile$ibs, k = 2)
x.juvenile <- loc.juvenile[, 1]; y.juvenile <- loc.juvenile[, 2]
a <- as.factor(subset(tab.juvenile.annot, sample.id!="044")$population)
b <- as.numeric(as.factor(subset(tab.juvenile.annot, sample.id!="044")$pCO2.parent))
plot(x.juvenile, y.juvenile, col=a, pch=b, xlab = "", ylab = "",
main = "Multidimensional Scaling Analysis (IBS)", cex=1)
#text(x.juvenile, y.juvenile+.012, labels=tab.juvenile.annot$sample.id, cex=0.7, font=2, col=as.integer(group.juvenile))
#legend("topleft", legend=levels(group.juvenile), pch="o", cex=0.8) #text.col=1:nlevels(group.juvenile),
Fst.all <- rbind(Fst.adults, Fst.larvae, Fst.juvenile)
Fst.means <- Fst.all %>% select(-snp.id) %>% group_by(population, stage) %>% summarize(mean=mean(FstSNP), sd=sd(FstSNP), n=n())
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
AF.all <- rbind(AF.adults, AF.larvae, AF.juveniles)
AF.means <- AF.all %>% select(-snp.id) %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(AlleleFreq), sd=sd(AlleleFreq), n=n())
## `summarise()` has grouped output by 'population', 'stage'. You can override using the `.groups` argument.
ggplot(AF.all, aes(x=stage, y=AlleleFreq, fill=pCO2, alpha=stage)) + geom_violin(trim = T) + theme_minimal() +
ggtitle("Allele Frequency\nBy population, stage, & adult pCO2 treatment") + xlab("") + ylab("Allele Frequency") + facet_wrap(~population) +scale_alpha_discrete(range = c(0.85, 0.15)) #+
## Warning: Using alpha for a discrete variable is not advised.
# g0eom_point(data=AF.all %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(AlleleFreq)),
# aes(x=stage:pCO2, y=mean, label=round(mean, 3)), cex=2.25, inherit.aes=T, position = position_dodge(width=1))
ggplot(Fst.all, aes(x=stage, y=FstSNP, fill=stage)) + geom_violin(trim = T, alpha=0.4) + theme_minimal() + facet_wrap(~ population) +
stat_summary(fun.y=mean, geom="point", aes(group=stage), shape=8, size=2, color="black", fill="black") +
ggtitle("Per-SNP Fst among pCO2 treatments\nBy population and stage") + xlab("") + ylab("Per-Locus Fst") +
geom_text(data=Fst.means, aes(group=stage, y=-.33, label=round(mean, 3)), cex=3.25)
## Warning: `fun.y` is deprecated. Use `fun` instead.
ggplot(Fst.all %>% filter(population==c("FB","DB")), aes(x=FstSNP, fill=stage)) + geom_density(alpha=0.4) + theme_minimal() +
ggtitle("Per-SNP Fst among adult pCO2 treatments (high vs. ambient)\nBy population & stage") + xlab("") + ylab("Per-Locus Fst") + facet_wrap(~population)
# More analysis via vcfR - compare allelic richness and heterozygosity by population, stage and parental pCO2 exposure
First filter the vcf files for each stage (adult, larvae, adult) to remove loci with tons if missing data. Do this via vcftools.
I used this tutorial as an example. - call vcftools - feed it a vcf file after the –vcf flag - Set –max-missing 0.90 to keep only loci with 10% missing genotypes across all individuals - The –recode flag tells the program to write a new vcf file with the filters - The –recode-INFO-all keeps all the INFO flags from the old vcf file in the new one. - Lastly, –out designates the name of the output
# adults
vcftools --gzvcf "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered-true.vcf.gz" \
--max-missing 0.85 --recode --recode-INFO-all --out \
"../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-4vcfR"
##
## VCFtools - 0.1.17
## (C) Adam Auton and Anthony Marcketta 2009
##
## Parameters as interpreted:
## --gzvcf ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-filtered-true.vcf.gz
## --recode-INFO-all
## --max-missing 0.85
## --out ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-4vcfR
## --recode
##
## Using zlib version: 1.2.11
## Warning: Expected at least 2 parts in FORMAT entry: ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles">
## Warning: Expected at least 2 parts in FORMAT entry: ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
## Warning: Expected at least 2 parts in FORMAT entry: ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
## Warning: Expected at least 2 parts in FORMAT entry: ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)">
## Warning: Expected at least 2 parts in INFO entry: ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
## After filtering, kept 53 out of 53 Individuals
## Outputting VCF file...
## After filtering, kept 45063 out of a possible 179693 Sites
## Run Time = 8.00 seconds
# larvae
vcftools --gzvcf "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered-true.vcf.gz" \
--max-missing 0.85 --recode --recode-INFO-all --out \
"../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-4vcfR"
##
## VCFtools - 0.1.17
## (C) Adam Auton and Anthony Marcketta 2009
##
## Parameters as interpreted:
## --gzvcf ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-filtered-true.vcf.gz
## --recode-INFO-all
## --max-missing 0.85
## --out ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-4vcfR
## --recode
##
## Using zlib version: 1.2.11
## Warning: Expected at least 2 parts in FORMAT entry: ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles">
## Warning: Expected at least 2 parts in FORMAT entry: ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
## Warning: Expected at least 2 parts in FORMAT entry: ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
## Warning: Expected at least 2 parts in FORMAT entry: ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)">
## Warning: Expected at least 2 parts in INFO entry: ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
## After filtering, kept 76 out of 76 Individuals
## Outputting VCF file...
## After filtering, kept 6029 out of a possible 131140 Sites
## Run Time = 4.00 seconds
# juveniles
vcftools --gzvcf "../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered-true.vcf.gz" \
--max-missing 0.85 --recode --recode-INFO-all --out \
"../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-4vcfR"
##
## VCFtools - 0.1.17
## (C) Adam Auton and Anthony Marcketta 2009
##
## Parameters as interpreted:
## --gzvcf ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-filtered-true.vcf.gz
## --recode-INFO-all
## --max-missing 0.85
## --out ../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-4vcfR
## --recode
##
## Using zlib version: 1.2.11
## Warning: Expected at least 2 parts in FORMAT entry: ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles">
## Warning: Expected at least 2 parts in FORMAT entry: ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
## Warning: Expected at least 2 parts in FORMAT entry: ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
## Warning: Expected at least 2 parts in FORMAT entry: ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)">
## Warning: Expected at least 2 parts in INFO entry: ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
## Warning: Expected at least 2 parts in INFO entry: ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
## After filtering, kept 16 out of 16 Individuals
## Outputting VCF file...
## After filtering, kept 33681 out of a possible 51818 Sites
## Run Time = 2.00 seconds
require(vcfR)
# Load the data
vcf.adult <- read.vcfR("../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-adult-4vcfR.recode.vcf", verbose = FALSE )
# Confirm order of samples in vcf matches order of samples in the "sample.info" dataframe
colnames(extract.gt(vcf.adult)) == sample.info.adult$sample #yes, all are TRUE
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Calculate differentiation / diversity stats for all population + pCO2 groups
myDiff.adult <- genetic_diff(vcf.adult, pops = sample.info.adult$pop_code, method = 'nei')
paste("No. SNPs included in this vcfR analysis, adults ", nrow(myDiff.adult), sep="")
## [1] "No. SNPs included in this vcfR analysis, adults 45063"
# Summarize diversity indices (mean)
knitr::kable(round(colMeans(myDiff.adult[,-1:-2], na.rm = TRUE), digits = 3))
| x | |
|---|---|
| Hs_Dabob Bay-Ambient | 0.128 |
| Hs_Dabob Bay-High | 0.129 |
| Hs_Fidalgo Bay-Ambient | 0.123 |
| Hs_Fidalgo Bay-High | 0.121 |
| Hs_Oyster Bay C1-Ambient | 0.130 |
| Hs_Oyster Bay C1-High | 0.123 |
| Ht | 0.141 |
| n_Dabob Bay-Ambient | 17.007 |
| n_Dabob Bay-High | 14.722 |
| n_Fidalgo Bay-Ambient | 16.442 |
| n_Fidalgo Bay-High | 16.319 |
| n_Oyster Bay C1-Ambient | 17.279 |
| n_Oyster Bay C1-High | 16.834 |
| Gst | 0.087 |
| Htmax | 0.853 |
| Gstmax | 0.857 |
| Gprimest | 0.109 |
# Violin plots of Heterozygosity
dpf.adult <- melt(myDiff.adult[,c(3:8)], varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(population=substring(variable, 4)) %>% separate(population, into = c("population", "pCO2"), sep = "-") %>%
mutate(population=as.factor(population), pCO2=as.factor(pCO2))
## No id variables; using all as measure variables
ggplot(dpf.adult, aes(x=population:pCO2, y=Depth)) + geom_violin(aes(fill=population, alpha=pCO2), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Heterozygosity, Adults \nNo. SNPs =", nrow(myDiff.adult))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
geom_text(data=dpf.adult %>% group_by(population, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:pCO2, y=-0.05, label=round(mean, 2)), size=3) +
scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
# Boxplots of allelic richness
dpf.n.adult <- melt(myDiff.adult[,c(10:15)], varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(population=substring(variable, 3)) %>% separate(population, into = c("population", "pCO2"), sep = "-") %>%
mutate(population=as.factor(population), pCO2=as.factor(pCO2))
## No id variables; using all as measure variables
ggplot(dpf.n.adult, aes(x=population:pCO2, y=Depth)) + geom_violin(aes(fill=population, alpha=pCO2), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Allelic Richness, Adults \nNo. SNPs =", nrow(myDiff.adult))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
geom_text(data=dpf.n.adult %>% group_by(population, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:pCO2, y=18.5, label=round(mean, 2)), size=3) +
scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
# Load the data
vcf.larvae <- read.vcfR("../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-larvae-4vcfR.recode.vcf", verbose = FALSE )
# Write script to order sample.info dataframe to have same order as vcf
colnames(extract.gt(vcf.larvae)) == sample.info.larvae[match(colnames(extract.gt(vcf.larvae)), sample.info.larvae$sample),]$sample #yes, all are TRUE
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [76] TRUE
# Calculate differentiation / diversity stats for all population + pCO2 groups
myDiff.larvae <- genetic_diff(vcf.larvae, pops = sample.info.larvae[match(colnames(extract.gt(vcf.larvae)), sample.info.larvae$sample),]$pop_code, method = 'nei')
paste("No. SNPs included in this vcfR analysis, juveniles ", nrow(myDiff.larvae), sep="")
## [1] "No. SNPs included in this vcfR analysis, juveniles 6029"
# Summarize diversity indices (mean)
knitr::kable(round(colMeans(myDiff.larvae[,-1:-2], na.rm = TRUE), digits = 3))
| x | |
|---|---|
| Hs_Dabob Bay-Ambient | 0.081 |
| Hs_Dabob Bay-High | 0.082 |
| Hs_Fidalgo Bay-Ambient | 0.084 |
| Hs_Fidalgo Bay-High | 0.090 |
| Hs_Oyster Bay C1-Ambient | 0.068 |
| Hs_Oyster Bay C1-High | 0.069 |
| Hs_Oyster Bay C2-Ambient | 0.068 |
| Hs_Oyster Bay C2-High | 0.063 |
| Ht | 0.081 |
| n_Dabob Bay-Ambient | 9.268 |
| n_Dabob Bay-High | 9.608 |
| n_Fidalgo Bay-Ambient | 11.806 |
| n_Fidalgo Bay-High | 19.365 |
| n_Oyster Bay C1-Ambient | 27.795 |
| n_Oyster Bay C1-High | 28.190 |
| n_Oyster Bay C2-Ambient | 13.528 |
| n_Oyster Bay C2-High | 14.253 |
| Gst | 0.082 |
| Htmax | 0.862 |
| Gstmax | 0.916 |
| Gprimest | 0.092 |
# Violin plots of Heterozygosity
dpf.larvae <- melt(myDiff.larvae[,c(3:10)], varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(population=substring(variable, 4)) %>% separate(population, into = c("population", "pCO2"), sep = "-") %>%
mutate(population=as.factor(population), pCO2=as.factor(pCO2))
## No id variables; using all as measure variables
ggplot(dpf.larvae, aes(x=population:pCO2, y=Depth)) + geom_violin(aes(fill=population, alpha=pCO2), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Heterozygosity, Larvae \nNo. SNPs =", nrow(myDiff.larvae))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
geom_text(data=dpf.larvae %>% group_by(population, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:pCO2, y=-0.05, label=round(mean, 2)), size=3) +
scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
# Boxplots of allelic richness
dpf.n.larvae <- melt(myDiff.larvae[,c(12:19)], varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(population=substring(variable, 3)) %>% separate(population, into = c("population", "pCO2"), sep = "-") %>%
mutate(population=as.factor(population), pCO2=as.factor(pCO2))
## No id variables; using all as measure variables
ggplot(dpf.n.larvae, aes(x=population:pCO2, y=Depth)) + geom_violin(aes(fill=population, alpha=pCO2), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Allelic Richness, Larvae \nNo. SNPs =", nrow(myDiff.larvae))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
geom_text(data=dpf.n.larvae %>% group_by(population, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:pCO2, y=3, label=round(mean, 2)), size=3) +
scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
# 1-yr old “juveniles”
# Load the data
vcf.juvenile <- read.vcfR("../qc-processing/gatk/Olurida_QuantSeq2020_genotypes-juvenile-4vcfR.recode.vcf", verbose = FALSE )
# Confirm order of samples in vcf matches order of samples in the "sample.info" dataframe
colnames(extract.gt(vcf.juvenile)) == sample.info.juvenile$sample #yes, all are TRUE
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE
# Calculate differentiation / diversity stats for all population + pCO2 groups
myDiff.juvenile <- genetic_diff(vcf.juvenile, pops = sample.info.juvenile$pop_code, method = 'nei')
paste("No. SNPs included in this vcfR analysis, juveniles ", nrow(myDiff.juvenile), sep="")
## [1] "No. SNPs included in this vcfR analysis, juveniles 33681"
# Summarize diversity indices (mean)
knitr::kable(round(colMeans(myDiff.juvenile[,-1:-2], na.rm = TRUE), digits = 3))
| x | |
|---|---|
| Hs_Dabob Bay-Ambient | 0.189 |
| Hs_Dabob Bay-High | 0.197 |
| Hs_Fidalgo Bay-Ambient | 0.198 |
| Hs_Fidalgo Bay-High | 0.195 |
| Ht | 0.246 |
| n_Dabob Bay-Ambient | 7.637 |
| n_Dabob Bay-High | 7.838 |
| n_Fidalgo Bay-Ambient | 7.741 |
| n_Fidalgo Bay-High | 7.425 |
| Gst | 0.185 |
| Htmax | 0.797 |
| Gstmax | 0.762 |
| Gprimest | 0.251 |
# Violin plots of Heterozygosity
dpf.juvenile <- melt(myDiff.juvenile[,c(3:6)], varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(population=substring(variable, 4)) %>% separate(population, into = c("population", "pCO2"), sep = "-") %>%
mutate(population=as.factor(population), pCO2=as.factor(pCO2))
## No id variables; using all as measure variables
ggplot(dpf.juvenile, aes(x=population:pCO2, y=Depth)) + geom_violin(aes(fill=population, alpha=pCO2), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Heterozygosity, Juveniles \nNo. SNPs =", nrow(myDiff.juvenile))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
geom_text(data=dpf.juvenile %>% group_by(population, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:pCO2, y=-0.05, label=round(mean, 2)), size=3) +
scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
# Boxplots of allelic richness
dpf.n.juvenile <- melt(myDiff.juvenile[,c(8:11)], varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(population=substring(variable, 3)) %>% separate(population, into = c("population", "pCO2"), sep = "-") %>%
mutate(population=as.factor(population), pCO2=as.factor(pCO2))
## No id variables; using all as measure variables
ggplot(dpf.n.juvenile, aes(x=population:pCO2, y=Depth)) + geom_violin(aes(fill=population, alpha=pCO2), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Allelic Richness, Juveniles \nNo. SNPs =", nrow(myDiff.juvenile))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
geom_text(data=dpf.n.juvenile %>% group_by(population, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:pCO2, y=8.2, label=round(mean, 2)), size=3) +
scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
myDiff.AL <- inner_join(myDiff.adult, myDiff.larvae, by=c("CHROM", "POS")) #7,202 loci common among all
paste("No. SNPs shared among adults and larvae: ", nrow(myDiff.AL), sep="")
## [1] "No. SNPs shared among adults and larvae: 1397"
# Compare adults vs. larvae
knitr::kable(round(colMeans(inner_join(myDiff.adult, myDiff.larvae, by=c("CHROM", "POS"))[,-1:-2], na.rm = TRUE), digits = 3))
| x | |
|---|---|
| Hs_Dabob Bay-Ambient.x | 0.135 |
| Hs_Dabob Bay-High.x | 0.118 |
| Hs_Fidalgo Bay-Ambient.x | 0.111 |
| Hs_Fidalgo Bay-High.x | 0.117 |
| Hs_Oyster Bay C1-Ambient.x | 0.115 |
| Hs_Oyster Bay C1-High.x | 0.106 |
| Ht.x | 0.132 |
| n_Dabob Bay-Ambient.x | 17.311 |
| n_Dabob Bay-High.x | 14.454 |
| n_Fidalgo Bay-Ambient.x | 16.500 |
| n_Fidalgo Bay-High.x | 16.839 |
| n_Oyster Bay C1-Ambient.x | 17.270 |
| n_Oyster Bay C1-High.x | 17.092 |
| Gst.x | 0.094 |
| Htmax.x | 0.851 |
| Gstmax.x | 0.866 |
| Gprimest.x | 0.114 |
| Hs_Dabob Bay-Ambient.y | 0.111 |
| Hs_Dabob Bay-High.y | 0.111 |
| Hs_Fidalgo Bay-Ambient.y | 0.110 |
| Hs_Fidalgo Bay-High.y | 0.111 |
| Hs_Oyster Bay C1-Ambient.y | 0.096 |
| Hs_Oyster Bay C1-High.y | 0.095 |
| Hs_Oyster Bay C2-Ambient | 0.090 |
| Hs_Oyster Bay C2-High | 0.088 |
| Ht.y | 0.109 |
| n_Dabob Bay-Ambient.y | 9.244 |
| n_Dabob Bay-High.y | 9.618 |
| n_Fidalgo Bay-Ambient.y | 11.810 |
| n_Fidalgo Bay-High.y | 19.373 |
| n_Oyster Bay C1-Ambient.y | 27.993 |
| n_Oyster Bay C1-High.y | 28.306 |
| n_Oyster Bay C2-Ambient | 13.489 |
| n_Oyster Bay C2-High | 14.109 |
| Gst.y | 0.090 |
| Htmax.y | 0.865 |
| Gstmax.y | 0.888 |
| Gprimest.y | 0.103 |
# Heterozygosity
dpf.AL <-
melt(inner_join(myDiff.adult, myDiff.larvae, by=c("CHROM", "POS")) [,c(3:8,20:27)],
varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(variable=gsub("\\.x", ".adult", variable)) %>%
mutate(variable=gsub("\\.y", ".larvae", variable)) %>%
mutate(population=substring(variable, 4)) %>% separate(population, into = c("population", "pCO2", "stage"), sep = '[-.]') %>%
mutate_each_(funs(factor(.)),c("population", "pCO2", "stage"))
## Warning: `mutate_each_()` was deprecated in dplyr 0.7.0.
## Please use `across()` instead.
## No id variables; using all as measure variables
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2794 rows [16765,
## 16766, 16767, 16768, 16769, 16770, 16771, 16772, 16773, 16774, 16775, 16776,
## 16777, 16778, 16779, 16780, 16781, 16782, 16783, 16784, ...].
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
ggplot(subset(dpf.AL, population!="Oyster Bay C2")%>%droplevels(), aes(x=population:stage:pCO2, y=Depth)) +
geom_violin(aes(fill=population, alpha=pCO2, col=stage), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Heterozygosity, Adults & Larvae\nNo. shared SNPs =", nrow(myDiff.AL))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
geom_text(data=dpf.AL %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:stage:pCO2, y=-0.05, label=round(mean, 2)), size=3) +
scale_colour_manual(values=c("black", "dodgerblue3")) +
scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population', 'stage'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
## Warning: Removed 2 rows containing missing values (geom_text).
# Allelic Richness
dpf.n.AL <- melt(inner_join(myDiff.adult, myDiff.larvae, by=c("CHROM", "POS")) [,c(10:15,29:36)],
varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(variable=gsub("\\.x", ".adult", variable)) %>%
mutate(variable=gsub("\\.y", ".larvae", variable)) %>%
mutate(population=substring(variable, 3)) %>% separate(population, into = c("population", "pCO2", "stage"), sep = '[-.]') %>%
mutate_each_(funs(factor(.)),c("population", "pCO2", "stage"))
## No id variables; using all as measure variables
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2794 rows [16765,
## 16766, 16767, 16768, 16769, 16770, 16771, 16772, 16773, 16774, 16775, 16776,
## 16777, 16778, 16779, 16780, 16781, 16782, 16783, 16784, ...].
ggplot(subset(dpf.n.AL, population!="Oyster Bay C2")%>%droplevels(), aes(x=population:stage:pCO2, y=Depth)) +
geom_violin(aes(fill=population, alpha=pCO2, col=stage), width=1) + xlab("") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1.3)) + ggtitle(paste("Allelic Richness, Adults & Larvae\nNo. shared SNPs =", nrow(myDiff.AL))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
geom_text(data=dpf.n.AL %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:stage:pCO2, y=2, label=round(mean, 2)), size=3) +
scale_colour_manual(values=c("black", "dodgerblue3")) +
scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population', 'stage'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
## Warning: Removed 2 rows containing missing values (geom_text).
myDiff.AJ <- inner_join(myDiff.adult, myDiff.juvenile, by=c("CHROM", "POS"))
paste("No. SNPs shared among adults and juveniles: ", nrow(myDiff.AJ), sep="")
## [1] "No. SNPs shared among adults and juveniles: 4352"
# Compare adults vs. larvae
knitr::kable(round(colMeans(myDiff.AJ[,-1:-2], na.rm = TRUE), digits = 3))
| x | |
|---|---|
| Hs_Dabob Bay-Ambient.x | 0.203 |
| Hs_Dabob Bay-High.x | 0.198 |
| Hs_Fidalgo Bay-Ambient.x | 0.192 |
| Hs_Fidalgo Bay-High.x | 0.200 |
| Hs_Oyster Bay C1-Ambient | 0.192 |
| Hs_Oyster Bay C1-High | 0.184 |
| Ht.x | 0.219 |
| n_Dabob Bay-Ambient.x | 17.248 |
| n_Dabob Bay-High.x | 14.235 |
| n_Fidalgo Bay-Ambient.x | 16.196 |
| n_Fidalgo Bay-High.x | 16.919 |
| n_Oyster Bay C1-Ambient | 17.188 |
| n_Oyster Bay C1-High | 17.011 |
| Gst.x | 0.099 |
| Htmax.x | 0.864 |
| Gstmax.x | 0.779 |
| Gprimest.x | 0.135 |
| Hs_Dabob Bay-Ambient.y | 0.182 |
| Hs_Dabob Bay-High.y | 0.190 |
| Hs_Fidalgo Bay-Ambient.y | 0.200 |
| Hs_Fidalgo Bay-High.y | 0.188 |
| Ht.y | 0.243 |
| n_Dabob Bay-Ambient.y | 7.515 |
| n_Dabob Bay-High.y | 7.822 |
| n_Fidalgo Bay-Ambient.y | 7.842 |
| n_Fidalgo Bay-High.y | 7.394 |
| Gst.y | 0.193 |
| Htmax.y | 0.795 |
| Gstmax.y | 0.767 |
| Gprimest.y | 0.259 |
# Heterozygosity
dpf.AJ <- melt(select(myDiff.AJ,contains("Hs")),
varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(variable=gsub("\\.x", ".adult", variable)) %>%
mutate(variable=gsub("\\.y", ".juvenile", variable)) %>%
mutate(population=substring(variable, 4)) %>% separate(population, into = c("population", "pCO2", "stage"), sep = '[-.]') %>%
mutate_each_(funs(factor(.)),c("population", "pCO2", "stage"))
## No id variables; using all as measure variables
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 8704 rows [17409,
## 17410, 17411, 17412, 17413, 17414, 17415, 17416, 17417, 17418, 17419, 17420,
## 17421, 17422, 17423, 17424, 17425, 17426, 17427, 17428, ...].
ggplot(filter(dpf.AJ, !grepl('Oyster', population))%>%droplevels(), aes(x=population:stage:pCO2, y=Depth)) +
geom_violin(aes(fill=population, alpha=pCO2, col=stage), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Heterozygosity, Adults & Juveniles\nNo. shared SNPs =", nrow(myDiff.AJ))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
geom_text(data=filter(dpf.AJ, !grepl('Oyster', population))%>%droplevels() %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:stage:pCO2, y=-0.05, label=round(mean, 2)), size=3) +
scale_colour_manual(values=c("black", "darkred")) +
scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population', 'stage'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
# Allelic Richness
dpf.n.AJ <- melt(select(myDiff.AJ,contains("n_")),
varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(variable=gsub("\\.x", ".adult", variable)) %>%
mutate(variable=gsub("\\.y", ".juvenile", variable)) %>%
mutate(population=substring(variable, 3)) %>% separate(population, into = c("population", "pCO2", "stage"), sep = '[-.]') %>%
mutate_each_(funs(factor(.)),c("population", "pCO2", "stage"))
## No id variables; using all as measure variables
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 8704 rows [17409,
## 17410, 17411, 17412, 17413, 17414, 17415, 17416, 17417, 17418, 17419, 17420,
## 17421, 17422, 17423, 17424, 17425, 17426, 17427, 17428, ...].
ggplot(filter(dpf.n.AJ, !grepl('Oyster', population))%>%droplevels(), aes(x=population:stage:pCO2, y=Depth)) +
geom_violin(aes(fill=population, alpha=pCO2, col=stage), adjust = 1.2) + xlab("") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Heterozygosity, Adults & Juveniles\nNo. shared SNPs =", nrow(myDiff.AJ))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
geom_text(data=filter(dpf.n.AJ, !grepl('Oyster', population))%>%droplevels() %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(Depth)), aes(x=population:stage:pCO2, y=-0.05, label=round(mean, 2)), size=3) +
scale_colour_manual(values=c("black", "darkred")) +
scale_alpha_discrete(guide="none")
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population', 'stage'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
myDiff.ALJ <- inner_join(inner_join(myDiff.adult, myDiff.larvae, by=c("CHROM", "POS")), myDiff.juvenile, by=c("CHROM", "POS"))
paste("No. SNPs shared among adults, larvae, and juveniles: ", nrow(myDiff.ALJ), sep="")
## [1] "No. SNPs shared among adults, larvae, and juveniles: 412"
# Compare adults vs. larvae
knitr::kable(round(colMeans(myDiff.ALJ[,-1:-2], na.rm = TRUE), digits = 3))
| x | |
|---|---|
| Hs_Dabob Bay-Ambient.x | 0.188 |
| Hs_Dabob Bay-High.x | 0.169 |
| Hs_Fidalgo Bay-Ambient.x | 0.142 |
| Hs_Fidalgo Bay-High.x | 0.150 |
| Hs_Oyster Bay C1-Ambient.x | 0.141 |
| Hs_Oyster Bay C1-High.x | 0.130 |
| Ht.x | 0.173 |
| n_Dabob Bay-Ambient.x | 17.398 |
| n_Dabob Bay-High.x | 14.296 |
| n_Fidalgo Bay-Ambient.x | 16.379 |
| n_Fidalgo Bay-High.x | 16.995 |
| n_Oyster Bay C1-Ambient.x | 17.194 |
| n_Oyster Bay C1-High.x | 17.121 |
| Gst.x | 0.102 |
| Htmax.x | 0.857 |
| Gstmax.x | 0.825 |
| Gprimest.x | 0.130 |
| Hs_Dabob Bay-Ambient.y | 0.150 |
| Hs_Dabob Bay-High.y | 0.154 |
| Hs_Fidalgo Bay-Ambient.y | 0.151 |
| Hs_Fidalgo Bay-High.y | 0.166 |
| Hs_Oyster Bay C1-Ambient.y | 0.117 |
| Hs_Oyster Bay C1-High.y | 0.115 |
| Hs_Oyster Bay C2-Ambient | 0.114 |
| Hs_Oyster Bay C2-High | 0.110 |
| Ht.y | 0.143 |
| n_Dabob Bay-Ambient.y | 9.194 |
| n_Dabob Bay-High.y | 9.476 |
| n_Fidalgo Bay-Ambient.y | 12.010 |
| n_Fidalgo Bay-High.y | 19.427 |
| n_Oyster Bay C1-Ambient.y | 27.607 |
| n_Oyster Bay C1-High.y | 28.364 |
| n_Oyster Bay C2-Ambient | 13.495 |
| n_Oyster Bay C2-High | 14.689 |
| Gst.y | 0.097 |
| Htmax.y | 0.870 |
| Gstmax.y | 0.853 |
| Gprimest.y | 0.115 |
| Hs_Dabob Bay-Ambient | 0.158 |
| Hs_Dabob Bay-High | 0.192 |
| Hs_Fidalgo Bay-Ambient | 0.170 |
| Hs_Fidalgo Bay-High | 0.150 |
| Ht | 0.214 |
| n_Dabob Bay-Ambient | 7.379 |
| n_Dabob Bay-High | 7.709 |
| n_Fidalgo Bay-Ambient | 7.854 |
| n_Fidalgo Bay-High | 7.345 |
| Gst | 0.193 |
| Htmax | 0.789 |
| Gstmax | 0.793 |
| Gprimest | 0.249 |
# Heterozygosity
dpf.ALJ <- melt(select(myDiff.ALJ,contains("Hs")) %>% rename_at(vars(`Hs_Dabob Bay-Ambient`:`Hs_Fidalgo Bay-High`), paste0, ".z"),
varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(variable=gsub("\\.x", ".adult", variable)) %>%
mutate(variable=gsub("\\.y", ".larvae", variable)) %>%
mutate(variable=gsub("\\.z", ".juvenile", variable)) %>%
mutate(population=substring(variable, 4)) %>% separate(population, into = c("population", "pCO2", "stage"), sep = '[-.]') %>%
mutate_each_(funs(factor(.)),c("population", "pCO2", "stage")) %>%
mutate(stage=fct_relevel(stage, c("adult", "larvae", "juvenile")))
## No id variables; using all as measure variables
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 824 rows [4945,
## 4946, 4947, 4948, 4949, 4950, 4951, 4952, 4953, 4954, 4955, 4956, 4957, 4958,
## 4959, 4960, 4961, 4962, 4963, 4964, ...].
ggplot(subset(dpf.ALJ, population!="Oyster Bay C2")%>% droplevels(), aes(x=population:stage:pCO2, y=Depth)) +
geom_violin(aes(fill=stage, alpha=pCO2), adjust = 1.2) + xlab("Parental pCO2 exposure") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Heterozygosity: Adults, Larvae, and Juveniles\nNo. shared SNPs =", nrow(myDiff.ALJ))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
stat_summary(fun.y=mean, geom="point", shape=15, size=3, color="black", fill="black") +
geom_text(data=subset(dpf.ALJ, population!="Oyster Bay C2")%>%droplevels() %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(Depth)),
aes(x=population:stage:pCO2, y=-0.05, label=round(mean, 2)), size=3) +
scale_alpha_discrete(guide="none") +
annotate("text", x = 3, y = .6, label = "Dabob Bay", size=3.5) +
annotate("text", x = 9, y = .6, label = "Fidalgo Bay", size=3.5) +
annotate("text", x = 14.5, y = .6, label = "Oyster Bay", size=3.5) +
geom_vline(xintercept=6.5) + geom_vline(xintercept=12.5) +
scale_x_discrete(labels=rep(c("Ambient", "High"), times=8))
## Warning: Using alpha for a discrete variable is not advised.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## `summarise()` has grouped output by 'population', 'stage'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
# Allelic Richness
dpf.n.ALJ <- melt(select(myDiff.ALJ,contains("n_")) %>% rename_at(vars(`n_Dabob Bay-Ambient`:`n_Fidalgo Bay-High`), paste0, ".z"),
varnames=c('Index', 'Sample'), value.name = 'Depth', na.rm=TRUE) %>%
mutate(variable=gsub("\\.x", ".adult", variable)) %>%
mutate(variable=gsub("\\.y", ".larvae", variable)) %>%
mutate(variable=gsub("\\.z", ".juvenile", variable)) %>%
mutate(population=substring(variable, 3)) %>% separate(population, into = c("population", "pCO2", "stage"), sep = '[-.]') %>%
mutate_each_(funs(factor(.)),c("population", "pCO2", "stage")) %>%
mutate(stage=fct_relevel(stage, c("adult", "larvae", "juvenile")))
## No id variables; using all as measure variables
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 824 rows [4945,
## 4946, 4947, 4948, 4949, 4950, 4951, 4952, 4953, 4954, 4955, 4956, 4957, 4958,
## 4959, 4960, 4961, 4962, 4963, 4964, ...].
ggplot(subset(dpf.n.ALJ, population!="Oyster Bay C2")%>%droplevels(), aes(x=population:stage:pCO2, y=Depth)) +
geom_violin(aes(fill=stage, alpha=pCO2),width=1.3) + xlab("Parental pCO2 exposure") + ylab("") + theme_bw() +
theme(axis.text.x = element_text(size=7, angle=50, hjust=1)) + ggtitle(paste("Allelic Richness: Adults, Larvae, and Juveniles\nNo. shared SNPs =", nrow(myDiff.ALJ))) +
scale_alpha_discrete(range = c(0.35, 0.9)) +
#stat_summary(fun.y=mean, geom="point", shape=15, size=2, color="black", fill="black") +
geom_text(data=subset(dpf.n.ALJ, population!="Oyster Bay C2")%>%droplevels() %>% group_by(population, stage, pCO2) %>% summarize(mean=mean(Depth)),
aes(x=population:stage:pCO2, y=3, label=round(mean, 2)), size=2.5) +
#scale_colour_manual(values=c("black", "dodgerblue3", "darkred")) +
scale_alpha_discrete(guide="none") + geom_vline(xintercept=6.5) + geom_vline(xintercept=12.5) +
annotate("text", x = 3, y = 30, label = "Dabob Bay", size=3.5) +
annotate("text", x = 9, y = 30, label = "Fidalgo Bay", size=3.5) +
annotate("text", x = 13.5, y = 30, label = "Oyster\nBay", size=3.5) +
scale_x_discrete(labels=rep(c("Ambient", "High"), times=8))
## Warning: Using alpha for a discrete variable is not advised.
## `summarise()` has grouped output by 'population', 'stage'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
## Scale for 'alpha' is already present. Adding another scale for 'alpha', which
## will replace the existing scale.
## Warning: position_dodge requires non-overlapping x intervals
I will use Colony to identify siblings in my samples. The Colony software is only available via the command line for Mac users (it’s typically used via GUI on Windows). To get the most current version of the program I actually had to email the developer. He send me a link to download it. I’m running Colony Version 2.0.6.6 (June 30, 2020), which is saved on my computer and in this project’s repo (references/colony2.mac.20200904).
Prepare files for Colony sibship analysis Colony identifies siblings, which I would like to do with my juveniles. Here I prepare files for Colony.
Mac suggested that I only keep SNPs with very high minor allele frequencies. Here I use SNPRelate to export SNPs with MAF >=2%.
snpset.adult.4Colony <- snpgdsLDpruning(genofile.adult, maf=.2, missing.rate=0, autosome.only=FALSE)
## SNP pruning based on LD:
## Excluding 174,997 SNPs (monomorphic: TRUE, MAF: 0.2, missing rate: 0)
## # of samples: 53
## # of SNPs: 246
## using 1 thread
## sliding window: 500,000 basepairs, Inf SNPs
## |LD| threshold: 0.2
## method: composite
## Chromosome Contig0_5313: 0.11%, 23/21,863
## Chromosome Contig104153_109739: 0.08%, 2/2,360
## Chromosome Contig109741_116186: 0.16%, 3/1,861
## Chromosome Contig11179_21262: 0.16%, 25/15,668
## Chromosome Contig116188_123130: 0.06%, 1/1,794
## Chromosome Contig123131_129982: 0.06%, 1/1,787
## Chromosome Contig129983_136896: 0.06%, 1/1,798
## Chromosome Contig136897_143777: 0.05%, 1/1,887
## Chromosome Contig150711_166372: 0.13%, 2/1,485
## Chromosome Contig166374_181896: 0.11%, 2/1,875
## Chromosome Contig181898_197428: 0.13%, 2/1,516
## Chromosome Contig21263_28135: 0.15%, 27/18,170
## Chromosome Contig213221_398012: 0.16%, 3/1,819
## Chromosome Contig28136_34745: 0.12%, 19/15,929
## Chromosome Contig34748_41067: 0.20%, 27/13,582
## Chromosome Contig398123_696856: 0.17%, 3/1,721
## Chromosome Contig41068_47193: 0.08%, 9/11,130
## Chromosome Contig47194_53180: 0.07%, 6/9,167
## Chromosome Contig5314_11178: 0.07%, 4/5,362
## Chromosome Contig53181_59083: 0.05%, 4/7,588
## Chromosome Contig59084_64831: 0.11%, 7/6,364
## Chromosome Contig64832_70524: 0.20%, 11/5,633
## Chromosome Contig70525_76135: 0.15%, 7/4,689
## Chromosome Contig76136_81765: 0.12%, 5/4,040
## Chromosome Contig81766_87352: 0.10%, 4/4,163
## Chromosome Contig87353_92929: 0.18%, 5/2,773
## Chromosome Contig92931_98526: 0.15%, 4/2,712
## Chromosome Contig98527_104152: 0.13%, 4/2,984
## 212 markers are selected in total.
snpset.adult.id.4Colony <- unlist(unname(snpset.adult.4Colony))
snpgdsCreateGenoSet(src.fn="../qc-processing/gatk/genotypes-adult.gds",
dest.fn="../qc-processing/colony/genotypes-adult-4Colony.gds",
snp.id=snpset.adult.id.4Colony)
## Create a GDS genotype file:
## The new dataset consists of 53 samples and 212 SNPs
## write sample.id
## write snp.id
## write snp.rs.id
## write snp.position
## write snp.chromosome
## write snp.allele
## SNP genotypes are stored in SNP-major mode (Sample X SNP).
SNPRelate codes alleles this way: “There are four possible values stored in the variable genotype: 0, 1, 2 and 3. For bi-allelic SNP sites, “0” indicates two B alleles, “1” indicates one A allele and one B allele, “2” indicates two A alleles, and “3” is a missing genotype. (When you) take out genotype data with sample and SNP IDs, and four possible values are returned 0, 1, 2 and NA (3 is replaced by NA).
Colony codes alleles this way: “The first column gives the individual ID (a string containing a maximum of 20 letters and/or numbers, no other characters are allowed), the 2nd and 3rd columns give the alleles observed for the individual at the first locus, the 4th and 5th give the alleles observed for the individual at the 2nd locus … An allele is identified by an integer, in the range of 1~999999999. If the locus is a dominant marker, then only one (instead of 2) column is required for the marker, and the value for the genotype should be either 1 (dominant phenotype, presence of a band) or 2 (recessive phenotype, absence of a band). Missing genotypes are indicated by 0 0 for a codominant marker and 0 for a dominant marker.” Also, there cannot be any NA values in the Colony genotype data.
I wanted to use the R program radiator to create a Colony formatted genotype file. But after a TON of attempts I wasn’t able to get my SNPRelate .gds data into radiator directly nor could I successfully import a .vcf file.
So, here I manually re-code the genotype matrix.
# Extract genotype matrix
geno.matrix4Colony.adult <- snpgdsGetGeno(gdsobj="../qc-processing/colony/genotypes-adult-4Colony.gds", with.id=TRUE)
## Genotype matrix: 53 samples X 212 SNPs
paste("No. of adult SNPs for Colony sibship analysis: ", ncol(geno.matrix4Colony.adult$genotype), sep="")
## [1] "No. of adult SNPs for Colony sibship analysis: 212"
paste("No. of adult samples for Colony sibship analysis: ", nrow(geno.matrix4Colony.adult$genotype), sep="")
## [1] "No. of adult samples for Colony sibship analysis: 53"
# Replace SNPRelate numeric allele coding with character allele coding
a <- geno.matrix4Colony.adult$genotype %>% as_data_frame() %>%
mutate_all(funs(str_replace(., "0", "BB"))) %>%
mutate_all(funs(str_replace(., "1", "AB"))) %>%
mutate_all(funs(str_replace(., "2", "AA")))
## Warning: `as_data_frame()` was deprecated in tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
# Duplicate each column. Now there's 2 columns per locus, both containing the same bi-allelic genotype info.
b <- a[rep(names(a), rep(2, times=ncol(a)))] %>% clean_names()
ncol(b) == ncol(a)*2 #should be TRUE
## [1] TRUE
# For each locus, extract the first allele in column 1, and the second allele in column 2.
c <- b #duplicate dataframe
odds <- seq(from = 1, to = ncol(c), by = 2)
evens <- seq(from = 2, to = ncol(c), by = 2)
odds.names <- colnames(b[odds])
evens.names <- colnames(b[evens])
for (i in 1:length(odds)) {
c[[odds.names[i]]] <- substring(b[[odds.names[i]]], first = 1, last = 1)
}
for (i in 1:length(evens)) {
c[[evens.names[i]]] <- substring(b[[evens.names[i]]], first = 2, last = 2)
}
# Now replace the character allele information with Colony's way of numeric coding.
# There are 2 columns for each locus (e.g. v1 and v1_2), and only 3 options for each allele:
# 1= B allele
# 2= A allele
# 0= missing allele
(geno.matrix4Colony.adult.recode <- c %>%
mutate_all(funs(str_replace(., "B", "1"))) %>%
mutate_all(funs(str_replace(., "A", "2"))) %>%
mutate(., across(everything(), ~replace_na(.x, 0))))
## # A tibble: 53 x 424
## v1 v1_2 v2 v2_2 v3 v3_2 v4 v4_2 v5 v5_2 v6 v6_2 v7
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2 1 2 2 2 1 2 2 2 1 1 1 2
## 2 2 2 2 2 1 1 1 1 2 1 1 1 2
## 3 2 2 2 1 2 1 2 1 2 2 2 2 2
## 4 2 2 2 1 2 1 1 1 1 1 2 2 2
## 5 2 2 1 1 2 2 2 1 2 1 2 2 1
## 6 2 2 2 1 2 1 2 1 2 1 1 1 2
## 7 2 2 2 2 2 1 2 1 2 2 2 2 2
## 8 2 2 1 1 2 1 1 1 1 1 1 1 2
## 9 2 2 1 1 2 1 1 1 1 1 1 1 2
## 10 2 1 2 1 2 1 1 1 2 1 2 2 2
## # … with 43 more rows, and 411 more variables: v7_2 <chr>, v8 <chr>,
## # v8_2 <chr>, v9 <chr>, v9_2 <chr>, v10 <chr>, v10_2 <chr>, v11 <chr>,
## # v11_2 <chr>, v12 <chr>, v12_2 <chr>, v13 <chr>, v13_2 <chr>, v14 <chr>,
## # v14_2 <chr>, v15 <chr>, v15_2 <chr>, v16 <chr>, v16_2 <chr>, v17 <chr>,
## # v17_2 <chr>, v18 <chr>, v18_2 <chr>, v19 <chr>, v19_2 <chr>, v20 <chr>,
## # v20_2 <chr>, v21 <chr>, v21_2 <chr>, v22 <chr>, v22_2 <chr>, v23 <chr>,
## # v23_2 <chr>, v24 <chr>, v24_2 <chr>, v25 <chr>, v25_2 <chr>, v26 <chr>,
## # v26_2 <chr>, v27 <chr>, v27_2 <chr>, v28 <chr>, v28_2 <chr>, v29 <chr>,
## # v29_2 <chr>, v30 <chr>, v30_2 <chr>, v31 <chr>, v31_2 <chr>, v32 <chr>,
## # v32_2 <chr>, v33 <chr>, v33_2 <chr>, v34 <chr>, v34_2 <chr>, v35 <chr>,
## # v35_2 <chr>, v36 <chr>, v36_2 <chr>, v37 <chr>, v37_2 <chr>, v38 <chr>,
## # v38_2 <chr>, v39 <chr>, v39_2 <chr>, v40 <chr>, v40_2 <chr>, v41 <chr>,
## # v41_2 <chr>, v42 <chr>, v42_2 <chr>, v43 <chr>, v43_2 <chr>, v44 <chr>,
## # v44_2 <chr>, v45 <chr>, v45_2 <chr>, v46 <chr>, v46_2 <chr>, v47 <chr>,
## # v47_2 <chr>, v48 <chr>, v48_2 <chr>, v49 <chr>, v49_2 <chr>, v50 <chr>,
## # v50_2 <chr>, v51 <chr>, v51_2 <chr>, v52 <chr>, v52_2 <chr>, v53 <chr>,
## # v53_2 <chr>, v54 <chr>, v54_2 <chr>, v55 <chr>, v55_2 <chr>, v56 <chr>,
## # v56_2 <chr>, v57 <chr>, …
# Save genotype matrix to file, while also adding a column with sample number
write_delim(cbind(sampleID=paste("O", geno.matrix4Colony.adult$sample.id, sep=""),
geno.matrix4Colony.adult.recode),
file="../qc-processing/colony/geno.matrix.adult.tab", delim=" ", col_names=FALSE)
I created a text file that will be the input for Colony. I used Colony’s example input file to do so.
Here’s a snapshot of the input file, truncating the genotype info (there are 53 samples, but here I only show 2)
head -n 28 ../qc-processing/colony/colony2_adults.dat
tail -n 12 ../qc-processing/colony/colony2_adults.dat
## 'adult_snps' !Dataset name
## 'adult_colony' !Output file name
## 53 ! Number of offspring in the sample
## 212 ! Number of loci
## 100 ! Seed for random number generator
## 0 ! 0/1=Not updating/updating allele frequency
## 2 ! 2/1=Dioecious/Monoecious species
## 0 ! 0/1=No inbreeding/inbreeding
## 0 ! 0/1=Diploid species/HaploDiploid species
## 0 0 ! 0/1=Polygamy/Monogamy for males & females
## 0 ! 0/1=Clone inference =No/Yes
## 0 ! 0/1=Full sibship size scaling =No/Yes
## 0 ! 0,1,2,3=No,weak,medium,strong sibship size prior; mean paternal & meteral sibship size
## 0 ! 0/1=Unknown/Known population allele frequency
## 1 ! Number of runs
## 2 ! Length of run
## 0 ! 0/1=Monitor method by Iterate#/Time in second
## 100000 ! Monitor interval in Iterate# / in seconds
## 0 ! non-Windows version
## 1 ! Fulllikelihood
## 3 ! 1/2/3=low/medium/high Precision for Fulllikelihood
##
## V@ !Marker names
## 0@ !Marker types, 0/1 = codominant/dominant
## 0.000@ !Allelic dropout rate
## 0.0001@ !false allele rate
##
## O291 2 1 2 2 2 1 2 2 2 1 1 1 2 2 1 1 2 2 2 2 2 2 2 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2 2 2 1 1 1 1 2 1 2 1 2 2 2 1 2 2 2 2 1 1 2 1 2 2 2 2 2 1 2 2 1 1 2 2 2 1 2 1 2 2 2 2 2 1 2 2 2 2 2 2 1 1 2 1 1 1 1 1 2 2 2 1 2 2 2 2 2 2 2 1 2 1 2 2 2 1 2 2 1 1 2 2 2 2 2 1 2 1 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 1 2 2 2 1 2 1 2 1 2 2 2 2 2 1 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 1 1 1 2 1 2 1 2 2 2 2 2 1 1 1 2 1 2 2 2 2 2 1 2 1 2 2 2 2 1 1 2 1 2 1 2 2 1 1 1 1 1 1 2 2 2 2 2 2 2 1 1 1 2 1 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 2 2 2 2 1 2 1 2 2 2 2 2 1 2 2 1 1 1 1 2 2 2 1 1 1 2 2 2 2 2 2 1 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 2 1 2 2 2 2 2 2 2 1 2 1 2 2 1 1 2 2 1 1 2 1 1 1 2 1 2 2 2 2 2 2 2 1 2 1 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 1 1 1 2 1 2 2 2 2 2 1 1 1 2 1 2 2 2 1 2 2 2 1 2 2 2 1 2 1 2 2 2 1 2 2 2 1 2 1 2 1 2 2 2 2 2 1 2 1 1 1 2 2 2 1 2 2 2 2 2 1 2 2 2 1
## O349 2 1 2 1 1 1 2 2 2 2 2 2 2 2 2 2 1 1 2 1 2 2 2 1 2 1 2 1 2 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 1 1 1 1 2 2 2 2 2 2 2 2 1 1 2 2 2 2 1 1 2 1 1 1 2 2 2 1 1 1 2 2 2 1 2 1 2 2 2 1 2 2 1 1 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 1 1 1 1 1 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 2 2 2 1 2 2 2 1 1 1 2 2 2 1 2 1 1 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 1 1 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 1 2 2 1 1 2 2 2 1 2 1 2 2 1 1 2 1 2 2 2 2 2 1 2 2 2 2 1 1 2 1 2 1 2 2 2 1 2 2 1 1 2 1 1 1 1 1 2 2 2 1 2 1 2 2 1 1 2 1 2 2 2 2 2 2 2 2 1 1 2 2 2 2 1 1 1 1 2 2 2 2 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 1 2 1 2 2 1 1 2 2 2 2 2 1 2 1 2 1 2 2 2 2 2 1 2 2 2 1 2 2 1 1 2 2 2 2 2 2 2 1 2 2 1 1 2 2 2 2 2 1 2 1
##
## 0 0 !prob. of dad/mum included in the candidates
## 0 0 !numbers of candiadte males & females
## 0 0 !#known fater-offspring dyads, paternity exclusion threshold
## 0 0 !#known moter-offspring dyads, maternity exclusion threshold
## 0 !#known paternal sibship with unknown fathers
## 0 !#known maternal sibship with unknown mothers
## 0 !#known paternity exclusions
## 0 !#known maternity exclusions
## 0 !#known paternal sibship exclusions
## 0 !#known maternal sibship exclusions
Note: I have to be in the colony directory to run it, so I change my directory in the same code chunk prior to executing the ./colony2s.out script
cd ../qc-processing/colony/
./colony2s.out IFN:colony2_adults.dat
cat ../qc-processing/colony/adult_colony.BestCluster
## ClusterIndex Probability OffspringID FatherID MotherID
## 1 0.7953 O291 *1 #1
## 1 0.7953 O293 *3 #1
## 1 0.7953 O294 *3 #3
## 1 0.7953 O302 *1 #3
## 1 0.7953 O305 *1 #3
## 1 0.7953 O307 *1 #3
## 1 0.7953 O309 *1 #3
## 2 0.9974 O292 *2 #2
## 2 0.9974 O296 *2 #5
## 2 0.9974 O299 *2 #5
## 2 0.9974 O301 *2 #2
## 3 0.9199 O295 *4 #4
## 3 0.9199 O303 *4 #4
## 3 0.9199 O304 *4 #4
## 3 0.9199 O308 *4 #4
## 4 0.9864 O298 *5 #6
## 4 0.9864 O306 *6 #6
## 5 0.9859 O311 *7 #7
## 5 0.9859 O315 *7 #11
## 5 0.9859 O318 *7 #12
## 5 0.9859 O328 *7 #11
## 5 0.9859 O329 *7 #11
## 6 0.7866 O312 *8 #8
## 6 0.7866 O314 *8 #10
## 6 0.7866 O316 *8 #8
## 6 0.7866 O317 *8 #10
## 6 0.7866 O321 *8 #10
## 6 0.7866 O323 *8 #8
## 6 0.7866 O327 *8 #10
## 7 0.2345 O313 *9 #9
## 7 0.2345 O319 *10 #13
## 7 0.2345 O322 *9 #13
## 7 0.2345 O324 *9 #9
## 7 0.2345 O326 *9 #13
## 8 0.3050 O325 *11 #14
## 8 0.3050 O333 *14 #17
## 8 0.3050 O337 *14 #14
## 9 0.9090 O331 *12 #15
## 9 0.9090 O336 *12 #19
## 9 0.9090 O341 *17 #15
## 9 0.9090 O342 *12 #22
## 9 0.9090 O345 *19 #22
## 9 0.9090 O346 *19 #15
## 9 0.9090 O348 *12 #22
## 10 0.9796 O332 *13 #16
## 10 0.9796 O334 *15 #16
## 10 0.9796 O339 *13 #21
## 10 0.9796 O347 *13 #16
## 11 0.9952 O335 *16 #18
## 11 0.9952 O338 *16 #20
## 11 0.9952 O343 *16 #20
## 12 0.9870 O344 *18 #23
## 12 0.9870 O349 *18 #23
colony.clusters.adult <- read_table("../qc-processing/colony/adult_colony.BestCluster", col_names=TRUE) %>% clean_names() %>%
mutate_all(funs(str_replace(., "\\*|#", ""))) %>%
mutate(cluster_index=as.numeric(cluster_index), father_id=as.numeric(father_id), mother_id=as.numeric(mother_id), probability=as.numeric(probability)) %>%
mutate(offspring_id=str_replace(offspring_id, "O", "")) %>% as.data.frame() %>% left_join(sample.info.adult, by = c("offspring_id"="sample"))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ClusterIndex = col_double(),
## Probability = col_double(),
## OffspringID = col_character(),
## FatherID = col_character(),
## MotherID = col_character()
## )
save(colony.clusters.adult, file = "../qc-processing/colony/best-cluster-adult")
# Plot mother ~ father ID
#ggplotly(
ggplot(colony.clusters.adult, aes(x=father_id, y=mother_id, size=probability, colour=population, shape=pCO2.parent)) + geom_jitter() + theme_minimal() + scale_shape_manual(values=c(1,2)) + scale_size_continuous(guide="none") #)
# How many fathers and mothers does each population + pCO2 group have?
colony.clusters.adult %>%
group_by(population, pCO2.parent) %>%
summarise(n_fathers=n_distinct(father_id),n_mothers=n_distinct(mother_id))
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## # A tibble: 6 x 4
## # Groups: population [3]
## population pCO2.parent n_fathers n_mothers
## <chr> <chr> <int> <int>
## 1 Dabob Bay Ambient 4 4
## 2 Dabob Bay High 5 6
## 3 Fidalgo Bay Ambient 6 5
## 4 Fidalgo Bay High 5 8
## 5 Oyster Bay C1 Ambient 4 6
## 6 Oyster Bay C1 High 4 7
# How many fathers and mothers does each population have
colony.clusters.adult %>%
group_by(population) %>%
summarise(n_fathers=n_distinct(father_id),n_mothers=n_distinct(mother_id))
## # A tibble: 3 x 3
## population n_fathers n_mothers
## <chr> <int> <int>
## 1 Dabob Bay 6 6
## 2 Fidalgo Bay 8 10
## 3 Oyster Bay C1 5 8
snpset.larvae.4Colony <- snpgdsLDpruning(genofile.larvae, maf=.2, missing.rate=0.15, autosome.only=FALSE)
## SNP pruning based on LD:
## Excluding 127,940 SNPs (monomorphic: TRUE, MAF: 0.2, missing rate: 0.15)
## # of samples: 76
## # of SNPs: 282
## using 1 thread
## sliding window: 500,000 basepairs, Inf SNPs
## |LD| threshold: 0.2
## method: composite
## Chromosome Contig0_5313: 0.13%, 20/15,947
## Chromosome Contig104153_109739: 0.25%, 4/1,625
## Chromosome Contig109741_116186: 0.15%, 2/1,328
## Chromosome Contig11179_21262: 0.21%, 26/12,305
## Chromosome Contig116188_123130: 0.50%, 6/1,207
## Chromosome Contig123131_129982: 0.27%, 3/1,099
## Chromosome Contig129983_136896: 0.33%, 4/1,222
## Chromosome Contig136897_143777: 0.23%, 3/1,321
## Chromosome Contig143780_150710: 0.25%, 3/1,179
## Chromosome Contig150711_166372: 0.19%, 2/1,031
## Chromosome Contig166374_181896: 0.09%, 1/1,077
## Chromosome Contig181898_197428: 0.09%, 1/1,122
## Chromosome Contig197431_213218: 0.18%, 2/1,107
## Chromosome Contig21263_28135: 0.18%, 24/13,677
## Chromosome Contig213221_398012: 0.17%, 2/1,191
## Chromosome Contig28136_34745: 0.17%, 22/12,701
## Chromosome Contig34748_41067: 0.14%, 15/10,383
## Chromosome Contig41068_47193: 0.16%, 13/7,913
## Chromosome Contig47194_53180: 0.13%, 9/6,968
## Chromosome Contig5314_11178: 0.14%, 5/3,691
## Chromosome Contig53181_59083: 0.19%, 11/5,690
## Chromosome Contig59084_64831: 0.23%, 10/4,432
## Chromosome Contig64832_70524: 0.30%, 12/4,031
## Chromosome Contig70525_76135: 0.32%, 11/3,438
## Chromosome Contig76136_81765: 0.07%, 2/2,871
## Chromosome Contig81766_87352: 0.11%, 3/2,843
## Chromosome Contig87353_92929: 0.31%, 6/1,952
## Chromosome Contig92931_98526: 0.42%, 8/1,900
## Chromosome Contig98527_104152: 0.10%, 2/1,979
## 232 markers are selected in total.
snpset.larvae.id.4Colony <- unlist(unname(snpset.larvae.4Colony))
snpgdsCreateGenoSet(src.fn="../qc-processing/gatk/genotypes-larvae.gds",
dest.fn="../qc-processing/colony/genotypes-larvae-4Colony.gds",
snp.id=snpset.larvae.id.4Colony)
## Create a GDS genotype file:
## The new dataset consists of 76 samples and 232 SNPs
## write sample.id
## write snp.id
## write snp.rs.id
## write snp.position
## write snp.chromosome
## write snp.allele
## SNP genotypes are stored in SNP-major mode (Sample X SNP).
# Extract genotype matrix
geno.matrix4Colony.larvae <- snpgdsGetGeno(gdsobj="../qc-processing/colony/genotypes-larvae-4Colony.gds", with.id=TRUE)
## Genotype matrix: 76 samples X 232 SNPs
paste("No. of larvae SNPs for Colony sibship analysis: ", ncol(geno.matrix4Colony.larvae$genotype), sep="")
## [1] "No. of larvae SNPs for Colony sibship analysis: 232"
paste("No. of larvae samples for Colony sibship analysis: ", nrow(geno.matrix4Colony.larvae$genotype), sep="")
## [1] "No. of larvae samples for Colony sibship analysis: 76"
# Replace SNPRelate numeric allele coding with character allele coding
a <- geno.matrix4Colony.larvae$genotype %>% as_data_frame() %>%
mutate_all(funs(str_replace(., "0", "BB"))) %>%
mutate_all(funs(str_replace(., "1", "AB"))) %>%
mutate_all(funs(str_replace(., "2", "AA")))
# Duplicate each column. Now there's 2 columns per locus, both containing the same bi-allelic genotype info.
b <- a[rep(names(a), rep(2, times=ncol(a)))] %>% clean_names()
ncol(b) == ncol(a)*2 #should be TRUE
## [1] TRUE
# For each locus, extract the first allele in column 1, and the second allele in column 2.
c <- b #duplicate dataframe
odds <- seq(from = 1, to = ncol(c), by = 2)
evens <- seq(from = 2, to = ncol(c), by = 2)
odds.names <- colnames(b[odds])
evens.names <- colnames(b[evens])
for (i in 1:length(odds.names)) {
c[[odds.names[i]]] <- substring(b[[odds.names[i]]], first = 1, last = 1)
}
for (i in 1:length(evens.names)) {
c[[evens.names[i]]] <- substring(b[[evens.names[i]]], first = 2, last = 2)
}
# Now replace the character allele information with Colony's way of numeric coding.
# There are 2 columns for each locus (e.g. v1 and v1_2), and only 3 options for each allele:
# 1= B allele
# 2= A allele
# 0= missing allele
(geno.matrix4Colony.larvae.recode <- c %>%
mutate_all(funs(str_replace(., "B", "1"))) %>%
mutate_all(funs(str_replace(., "A", "2"))) %>%
mutate(., across(everything(), ~replace_na(.x, 0))))
## # A tibble: 76 x 464
## v1 v1_2 v2 v2_2 v3 v3_2 v4 v4_2 v5 v5_2 v6 v6_2 v7
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2 2 2 1 2 2 0 0 2 2 2 2 2
## 2 0 0 2 2 1 1 0 0 0 0 0 0 2
## 3 0 0 2 2 0 0 0 0 2 2 2 1 0
## 4 2 2 2 1 2 2 2 2 2 2 1 1 2
## 5 1 1 2 1 2 2 1 1 2 2 2 2 2
## 6 0 0 2 2 2 2 1 1 2 2 2 2 2
## 7 0 0 0 0 2 2 0 0 0 0 0 0 2
## 8 0 0 0 0 0 0 0 0 2 2 2 2 2
## 9 2 1 2 1 2 2 2 2 2 2 2 2 0
## 10 2 1 2 1 2 2 2 2 1 1 2 2 2
## # … with 66 more rows, and 451 more variables: v7_2 <chr>, v8 <chr>,
## # v8_2 <chr>, v9 <chr>, v9_2 <chr>, v10 <chr>, v10_2 <chr>, v11 <chr>,
## # v11_2 <chr>, v12 <chr>, v12_2 <chr>, v13 <chr>, v13_2 <chr>, v14 <chr>,
## # v14_2 <chr>, v15 <chr>, v15_2 <chr>, v16 <chr>, v16_2 <chr>, v17 <chr>,
## # v17_2 <chr>, v18 <chr>, v18_2 <chr>, v19 <chr>, v19_2 <chr>, v20 <chr>,
## # v20_2 <chr>, v21 <chr>, v21_2 <chr>, v22 <chr>, v22_2 <chr>, v23 <chr>,
## # v23_2 <chr>, v24 <chr>, v24_2 <chr>, v25 <chr>, v25_2 <chr>, v26 <chr>,
## # v26_2 <chr>, v27 <chr>, v27_2 <chr>, v28 <chr>, v28_2 <chr>, v29 <chr>,
## # v29_2 <chr>, v30 <chr>, v30_2 <chr>, v31 <chr>, v31_2 <chr>, v32 <chr>,
## # v32_2 <chr>, v33 <chr>, v33_2 <chr>, v34 <chr>, v34_2 <chr>, v35 <chr>,
## # v35_2 <chr>, v36 <chr>, v36_2 <chr>, v37 <chr>, v37_2 <chr>, v38 <chr>,
## # v38_2 <chr>, v39 <chr>, v39_2 <chr>, v40 <chr>, v40_2 <chr>, v41 <chr>,
## # v41_2 <chr>, v42 <chr>, v42_2 <chr>, v43 <chr>, v43_2 <chr>, v44 <chr>,
## # v44_2 <chr>, v45 <chr>, v45_2 <chr>, v46 <chr>, v46_2 <chr>, v47 <chr>,
## # v47_2 <chr>, v48 <chr>, v48_2 <chr>, v49 <chr>, v49_2 <chr>, v50 <chr>,
## # v50_2 <chr>, v51 <chr>, v51_2 <chr>, v52 <chr>, v52_2 <chr>, v53 <chr>,
## # v53_2 <chr>, v54 <chr>, v54_2 <chr>, v55 <chr>, v55_2 <chr>, v56 <chr>,
## # v56_2 <chr>, v57 <chr>, …
# Save genotype matrix to file, while also adding a column with sample number
write_delim(cbind(sampleID=paste("O", geno.matrix4Colony.larvae$sample.id, sep=""),
geno.matrix4Colony.larvae.recode),
file="../qc-processing/colony/geno.matrix.larvae.tab", delim=" ", col_names=FALSE)
I created a text file that will be the input for Colony. I used Colony’s example input file as template and edited by hand. For the genotype matrix I selected all contents in the “geno.matrix.larvae.tab” file and pasted it into the text file.
Here’s a snapshot of the input file, truncating the genotype info (there are 76 samples, but here I only show 4)
head -n 30 ../qc-processing/colony/colony2_larvae.dat
tail -n 12 ../qc-processing/colony/colony2_larvae.dat
## 'larvae_snps' !Dataset name
## 'larvae_colony' !Output file name
## 76 ! Number of offspring in the sample
## 231 ! Number of loci
## 100 ! Seed for random number generator
## 0 ! 0/1=Not updating/updating allele frequency
## 2 ! 2/1=Dioecious/Monoecious species
## 0 ! 0/1=No inbreeding/inbreeding
## 0 ! 0/1=Diploid species/HaploDiploid species
## 0 0 ! 0/1=Polygamy/Monogamy for males & females
## 0 ! 0/1=Clone inference =No/Yes
## 0 ! 0/1=Full sibship size scaling =No/Yes
## 0 ! 0,1,2,3=No,weak,medium,strong sibship size prior; mean paternal & meteral sibship size
## 0 ! 0/1=Unknown/Known population allele frequency
## 1 ! Number of runs
## 2 ! Length of run
## 0 ! 0/1=Monitor method by Iterate#/Time in second
## 100000 ! Monitor interval in Iterate# / in seconds
## 0 ! non-Windows version
## 1 ! Fulllikelihood
## 3 ! 1/2/3=low/medium/high Precision for Fulllikelihood
##
## V@ !Marker names
## 0@ !Marker types, 0/1 = codominant/dominant
## 0.000@ !Allelic dropout rate
## 0.0001@ !false allele rate
##
## O034 2 2 2 1 2 2 0 0 2 2 2 2 2 1 2 1 2 2 2 2 0 0 0 0 1 1 2 2 0 0 2 1 2 2 2 2 1 1 2 1 2 2 1 1 2 2 2 1 2 2 2 2 2 1 2 2 1 1 0 0 2 1 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2 2 2 2 1 2 1 0 0 2 2 2 2 2 2 2 2 2 2 0 0 2 2 2 2 2 1 2 1 2 1 1 1 0 0 2 1 2 2 0 0 2 2 1 1 1 1 1 1 2 1 2 1 2 2 2 2 2 2 2 2 1 1 2 2 0 0 2 2 2 2 1 1 2 1 0 0 2 2 2 1 0 0 2 2 2 2 1 1 2 2 2 2 2 1 0 0 2 2 0 0 2 1 1 1 2 2 2 2 2 2 2 2 0 0 2 2 2 2 0 0 2 2 2 2 2 2 1 1 2 1 2 2 0 0 1 1 2 2 2 1 2 2 2 2 2 2 2 2 0 0 0 0 2 2 0 0 2 1 2 2 2 2 2 1 2 1 2 2 2 2 2 1 2 2 0 0 1 1 2 1 2 1 2 2 2 1 1 1 0 0 2 1 0 0 2 2 2 2 2 1 0 0 0 0 0 0 2 2 2 2 0 0 2 2 2 2 2 2 0 0 2 1 2 1 2 2 0 0 2 1 2 1 1 1 2 2 2 2 2 2 1 1 0 0 2 2 0 0 2 2 2 2 2 2 0 0 2 1 2 1 2 1 1 1 2 1 0 0 2 2 2 1 2 2 2 2 2 2 2 2 0 0 2 2 2 1 2 2 1 1 2 1 2 2 2 2 1 1 1 1 1 1 1 1 2 2 2 2 0 0 2 2 0 0 2 2 2 1 0 0 2 1 0 0 2 2 2 2 2 2 2 2 2 2 1 1 2 1 2 1 2 1 2 1 2 2 2 2 2 2 0 0 0 0 2 2 2 1 2 2 2 2 0 0 2 2 2 1 2 1 2 1 2 2 0 0 2 2 1 1 2 1 0 0
## O035 0 0 2 2 1 1 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 2 2 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 0 2 2 0 0 2 2 0 0 0 0 1 1 2 2 0 0 2 2 0 0 1 1 2 2 0 0 0 0 1 1 2 2 2 2 2 2 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 2 2 2 2 2 2 2 2 0 0 2 2 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 2 2 2 2 2 2 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 2 2 0 0 0 0 2 2 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 2 2 2 1 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 2 2 0 0 0 0 1 1 0 0 0 0 0 0 2 2
## O037 0 0 2 2 0 0 0 0 2 2 2 1 0 0 2 2 0 0 0 0 1 1 0 0 0 0 0 0 1 1 2 2 2 2 0 0 0 0 0 0 2 2 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 2 2 2 2 2 1 1 1 0 0 2 2 2 2 0 0 2 2 0 0 2 2 2 2 0 0 2 2 2 2 0 0 2 2 2 2 2 2 2 2 0 0 1 1 0 0 0 0 0 0 2 2 0 0 0 0 2 1 2 2 0 0 0 0 0 0 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 2 2 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 2 2 2 1 0 0 2 2 0 0 2 2 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 2 2 2 1 0 0 2 2 0 0 2 2 1 1 0 0 0 0 2 2 2 2 0 0 2 2 2 1 0 0 0 0 2 2 0 0 2 2 2 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 2 2 2 2 0 0 0 0 2 2 0 0 0 0 0 0 2 2 2 2 2 1 2 2 2 2 2 2 0 0 2 2 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 0 0 2 2 0 0 0 0 2 2 0 0 1 1 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 2 1 2 2 2 2 1 1 0 0 1 1 0 0 0 0 0 0 0 0 2 2 2 2 2 2 0 0 0 0 0 0 2 2 2 2 0 0 2 2 2 2 2 2 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0
## O565 2 2 2 2 0 0 0 0 2 2 0 0 2 2 2 2 0 0 2 2 0 0 2 2 2 1 2 2 2 2 0 0 2 2 2 2 0 0 2 2 0 0 0 0 0 0 2 2 0 0 2 2 2 2 2 2 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 1 0 0 0 0 2 2 2 2 0 0 0 0 2 2 2 2 0 0 0 0 2 2 2 2 2 2 0 0 1 1 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 2 2 1 1 0 0 0 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 1 1 0 0 2 2 2 2 0 0 2 1 0 0 0 0 2 2 2 2 2 2 2 2 2 2 1 1 2 1 2 2 2 1 2 2 0 0 0 0 2 2 2 2 2 2 0 0 0 0 2 2 1 1 0 0 1 1 0 0 2 2 2 2 0 0 0 0 2 2 2 2 2 2 0 0 0 0 1 1 1 1 2 2 2 2 0 0 2 2 2 2 0 0 0 0 2 2 2 2 0 0 2 2 2 2 2 2 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 2 1 2 2 0 0 2 2 0 0 2 2 2 2 0 0 2 2 2 2 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 2 2 2 2 2 2 0 0 2 2 2 2 1 1 2 2 2 2 0 0 2 2 2 2 0 0 2 2 0 0 0 0 2 2 0 0 0 0 0 0 0 0 1 1 0 0 2 2 2 2 0 0 2 1 0 0 0 0 0 0 0 0 2 2 0 0 0 0 2 2 0 0 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0 2 2 2 2 0 0 0 0 2 2 0 0 2 2 2 1 2 1 0 0 2 2 1 1
##
## 0 0 !prob. of dad/mum included in the candidates
## 0 0 !numbers of candiadte males & females
## 0 0 !#known fater-offspring dyads, paternity exclusion threshold
## 0 0 !#known moter-offspring dyads, maternity exclusion threshold
## 0 !#known paternal sibship with unknown fathers
## 0 !#known maternal sibship with unknown mothers
## 0 !#known paternity exclusions
## 0 !#known maternity exclusions
## 0 !#known paternal sibship exclusions
## 0 !#known maternal sibship exclusions
Note: I have to be in the colony directory to run it, so I change my directory in the same code chunk prior to executing the ./colony2s.out script
cd ../qc-processing/colony/
./colony2s.out IFN:colony2_larvae.dat
cat ../qc-processing/colony/larvae_colony.BestCluster
## ClusterIndex Probability OffspringID FatherID MotherID
## 1 0.6778 O034 *1 #1
## 1 0.6778 O037 *3 #1
## 1 0.6778 O044 *3 #1
## 1 0.6778 O046 *7 #1
## 1 0.6778 O047 *7 #7
## 1 0.6778 O483 *3 #23
## 1 0.6778 O485 *23 #25
## 1 0.6778 O487 *23 #25
## 1 0.6778 O526 *7 #25
## 1 0.6778 O528 *23 #32
## 2 1.0000 O035 *2 #2
## 2 1.0000 O412 *2 #2
## 2 1.0000 O441 *2 #2
## 2 1.0000 O565 *2 #2
## 3 0.1356 O039 *4 #3
## 3 0.1356 O484 *22 #24
## 3 0.1356 O513 *4 #24
## 3 0.1356 O531 *22 #34
## 3 0.1356 O542 *33 #3
## 3 0.1356 O543 *22 #36
## 3 0.1356 O552 *4 #37
## 3 0.1356 O553 *34 #36
## 4 0.8784 O041 *5 #4
## 4 0.8784 O043 *5 #5
## 5 0.4926 O045 *6 #6
## 5 0.4926 O489 *24 #26
## 5 0.4926 O522 *27 #29
## 5 0.4926 O523 *28 #30
## 5 0.4926 O524 *28 #31
## 5 0.4926 O525 *28 #6
## 5 0.4926 O527 *29 #31
## 5 0.4926 O562 *27 #30
## 5 0.4926 O564 *24 #30
## 6 0.9721 O401 *8 #8
## 6 0.9721 O402 *9 #8
## 6 0.9721 O403 *10 #8
## 6 0.9721 O404 *11 #8
## 6 0.9721 O411 *12 #8
## 6 0.9721 O413 *13 #8
## 6 0.9721 O414 *8 #9
## 6 0.9721 O421 *14 #8
## 6 0.9721 O431 *15 #8
## 6 0.9721 O432 *8 #8
## 6 0.9721 O434 *15 #10
## 6 0.9721 O442 *16 #9
## 6 0.9721 O444 *18 #9
## 6 0.9721 O445 *18 #9
## 6 0.9721 O451 *18 #12
## 6 0.9721 O453 *16 #12
## 7 0.1504 O443 *17 #11
## 7 0.1504 O452 *17 #13
## 7 0.1504 O472 *17 #17
## 7 0.1504 O473 *17 #18
## 7 0.1504 O475 *17 #18
## 7 0.1504 O476 *17 #17
## 7 0.1504 O477 *17 #20
## 7 0.1504 O521 *26 #13
## 7 0.1504 O554 *26 #38
## 8 0.8628 O461 *19 #14
## 8 0.8628 O462 *19 #15
## 8 0.8628 O471 *19 #16
## 8 0.8628 O474 *19 #19
## 9 0.8967 O481 *20 #21
## 9 0.8967 O492 *20 #27
## 9 0.8967 O506 *20 #28
## 9 0.8967 O532 *31 #35
## 9 0.8967 O533 *32 #35
## 9 0.8967 O551 *31 #27
## 9 0.8967 O561 *35 #35
## 9 0.8967 O563 *31 #27
## 10 0.9372 O482 *21 #22
## 10 0.9372 O488 *21 #22
## 10 0.9372 O490 *25 #22
## 10 0.9372 O491 *25 #22
## 10 0.9372 O541 *21 #22
## 11 0.7313 O529 *30 #33
(colony.clusters.larvae <- read_table("../qc-processing/colony/larvae_colony.BestCluster", col_names=TRUE) %>% clean_names() %>%
mutate_all(funs(str_replace(., "\\*|#", ""))) %>%
mutate(cluster_index=as.numeric(cluster_index), father_id=as.numeric(father_id), mother_id=as.numeric(mother_id), probability=as.numeric(probability)) %>%
mutate(offspring_id=str_replace(offspring_id, "O", "")) %>% as.data.frame() %>% left_join(sample.info.larvae, by = c("offspring_id"="sample")))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ClusterIndex = col_double(),
## Probability = col_double(),
## OffspringID = col_character(),
## FatherID = col_character(),
## MotherID = col_character()
## )
## cluster_index probability offspring_id father_id mother_id stage
## 1 1 0.6778 034 1 1 larvae
## 2 1 0.6778 037 3 1 larvae
## 3 1 0.6778 044 3 1 larvae
## 4 1 0.6778 046 7 1 larvae
## 5 1 0.6778 047 7 7 larvae
## 6 1 0.6778 483 3 23 larvae
## 7 1 0.6778 485 23 25 larvae
## 8 1 0.6778 487 23 25 larvae
## 9 1 0.6778 526 7 25 larvae
## 10 1 0.6778 528 23 32 larvae
## 11 2 1.0000 035 2 2 larvae
## 12 2 1.0000 412 2 2 larvae
## 13 2 1.0000 441 2 2 larvae
## 14 2 1.0000 565 2 2 larvae
## 15 3 0.1356 039 4 3 larvae
## 16 3 0.1356 484 22 24 larvae
## 17 3 0.1356 513 4 24 larvae
## 18 3 0.1356 531 22 34 larvae
## 19 3 0.1356 542 33 3 larvae
## 20 3 0.1356 543 22 36 larvae
## 21 3 0.1356 552 4 37 larvae
## 22 3 0.1356 553 34 36 larvae
## 23 4 0.8784 041 5 4 larvae
## 24 4 0.8784 043 5 5 larvae
## 25 5 0.4926 045 6 6 larvae
## 26 5 0.4926 489 24 26 larvae
## 27 5 0.4926 522 27 29 larvae
## 28 5 0.4926 523 28 30 larvae
## 29 5 0.4926 524 28 31 larvae
## 30 5 0.4926 525 28 6 larvae
## 31 5 0.4926 527 29 31 larvae
## 32 5 0.4926 562 27 30 larvae
## 33 5 0.4926 564 24 30 larvae
## 34 6 0.9721 401 8 8 larvae
## 35 6 0.9721 402 9 8 larvae
## 36 6 0.9721 403 10 8 larvae
## 37 6 0.9721 404 11 8 larvae
## 38 6 0.9721 411 12 8 larvae
## 39 6 0.9721 413 13 8 larvae
## 40 6 0.9721 414 8 9 larvae
## 41 6 0.9721 421 14 8 larvae
## 42 6 0.9721 431 15 8 larvae
## 43 6 0.9721 432 8 8 larvae
## 44 6 0.9721 434 15 10 larvae
## 45 6 0.9721 442 16 9 larvae
## 46 6 0.9721 444 18 9 larvae
## 47 6 0.9721 445 18 9 larvae
## 48 6 0.9721 451 18 12 larvae
## 49 6 0.9721 453 16 12 larvae
## 50 7 0.1504 443 17 11 larvae
## 51 7 0.1504 452 17 13 larvae
## 52 7 0.1504 472 17 17 larvae
## 53 7 0.1504 473 17 18 larvae
## 54 7 0.1504 475 17 18 larvae
## 55 7 0.1504 476 17 17 larvae
## 56 7 0.1504 477 17 20 larvae
## 57 7 0.1504 521 26 13 larvae
## 58 7 0.1504 554 26 38 larvae
## 59 8 0.8628 461 19 14 larvae
## 60 8 0.8628 462 19 15 larvae
## 61 8 0.8628 471 19 16 larvae
## 62 8 0.8628 474 19 19 larvae
## 63 9 0.8967 481 20 21 larvae
## 64 9 0.8967 492 20 27 larvae
## 65 9 0.8967 506 20 28 larvae
## 66 9 0.8967 532 31 35 larvae
## 67 9 0.8967 533 32 35 larvae
## 68 9 0.8967 551 31 27 larvae
## 69 9 0.8967 561 35 35 larvae
## 70 9 0.8967 563 31 27 larvae
## 71 10 0.9372 482 21 22 larvae
## 72 10 0.9372 488 21 22 larvae
## 73 10 0.9372 490 25 22 larvae
## 74 10 0.9372 491 25 22 larvae
## 75 10 0.9372 541 21 22 larvae
## 76 11 0.7313 529 30 33 larvae
## population pCO2.parent pop_code
## 1 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 2 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 3 Oyster Bay C1 High Oyster Bay C1-High
## 4 Oyster Bay C1 High Oyster Bay C1-High
## 5 Oyster Bay C1 High Oyster Bay C1-High
## 6 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 7 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 8 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 9 Oyster Bay C1 High Oyster Bay C1-High
## 10 Oyster Bay C1 High Oyster Bay C1-High
## 11 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 12 Dabob Bay High Dabob Bay-High
## 13 Fidalgo Bay Ambient Fidalgo Bay-Ambient
## 14 Oyster Bay C2 High Oyster Bay C2-High
## 15 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 16 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 17 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 18 Oyster Bay C2 Ambient Oyster Bay C2-Ambient
## 19 Oyster Bay C2 High Oyster Bay C2-High
## 20 Oyster Bay C2 High Oyster Bay C2-High
## 21 Oyster Bay C2 Ambient Oyster Bay C2-Ambient
## 22 Oyster Bay C2 Ambient Oyster Bay C2-Ambient
## 23 Oyster Bay C1 High Oyster Bay C1-High
## 24 Oyster Bay C1 High Oyster Bay C1-High
## 25 Oyster Bay C1 High Oyster Bay C1-High
## 26 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 27 Oyster Bay C1 High Oyster Bay C1-High
## 28 Oyster Bay C1 High Oyster Bay C1-High
## 29 Oyster Bay C1 High Oyster Bay C1-High
## 30 Oyster Bay C1 High Oyster Bay C1-High
## 31 Oyster Bay C1 High Oyster Bay C1-High
## 32 Oyster Bay C2 High Oyster Bay C2-High
## 33 Oyster Bay C2 High Oyster Bay C2-High
## 34 Dabob Bay Ambient Dabob Bay-Ambient
## 35 Dabob Bay Ambient Dabob Bay-Ambient
## 36 Dabob Bay Ambient Dabob Bay-Ambient
## 37 Dabob Bay Ambient Dabob Bay-Ambient
## 38 Dabob Bay High Dabob Bay-High
## 39 Dabob Bay High Dabob Bay-High
## 40 Dabob Bay High Dabob Bay-High
## 41 Dabob Bay Ambient Dabob Bay-Ambient
## 42 Dabob Bay High Dabob Bay-High
## 43 Dabob Bay High Dabob Bay-High
## 44 Dabob Bay High Dabob Bay-High
## 45 Fidalgo Bay Ambient Fidalgo Bay-Ambient
## 46 Fidalgo Bay Ambient Fidalgo Bay-Ambient
## 47 Fidalgo Bay Ambient Fidalgo Bay-Ambient
## 48 Fidalgo Bay High Fidalgo Bay-High
## 49 Fidalgo Bay High Fidalgo Bay-High
## 50 Fidalgo Bay Ambient Fidalgo Bay-Ambient
## 51 Fidalgo Bay High Fidalgo Bay-High
## 52 Fidalgo Bay High Fidalgo Bay-High
## 53 Fidalgo Bay High Fidalgo Bay-High
## 54 Fidalgo Bay High Fidalgo Bay-High
## 55 Fidalgo Bay High Fidalgo Bay-High
## 56 Fidalgo Bay High Fidalgo Bay-High
## 57 Oyster Bay C1 High Oyster Bay C1-High
## 58 Oyster Bay C2 Ambient Oyster Bay C2-Ambient
## 59 Fidalgo Bay Ambient Fidalgo Bay-Ambient
## 60 Fidalgo Bay Ambient Fidalgo Bay-Ambient
## 61 Fidalgo Bay High Fidalgo Bay-High
## 62 Fidalgo Bay High Fidalgo Bay-High
## 63 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 64 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 65 Oyster Bay C1 High Oyster Bay C1-High
## 66 Oyster Bay C2 Ambient Oyster Bay C2-Ambient
## 67 Oyster Bay C2 Ambient Oyster Bay C2-Ambient
## 68 Oyster Bay C2 Ambient Oyster Bay C2-Ambient
## 69 Oyster Bay C2 High Oyster Bay C2-High
## 70 Oyster Bay C2 High Oyster Bay C2-High
## 71 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 72 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 73 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 74 Oyster Bay C1 Ambient Oyster Bay C1-Ambient
## 75 Oyster Bay C2 High Oyster Bay C2-High
## 76 Oyster Bay C1 High Oyster Bay C1-High
save(colony.clusters.larvae, file = "../qc-processing/colony/best-cluster-larvae")
# Plot mother ~ father ID
ggplotly(
ggplot(colony.clusters.larvae, aes(x=father_id, y=mother_id, size=probability, colour=population, shape=pCO2.parent)) + geom_jitter() + theme_minimal() + scale_shape_manual(values=c(1,2)) + scale_size_continuous(guide="none"))
# How many fathers and mothers does each population + pCO2 group have?
colony.clusters.larvae %>%
group_by(population, pCO2.parent) %>%
summarise(n_fathers=n_distinct(father_id),n_mothers=n_distinct(mother_id)) %>%
pivot_longer(names_to = "parent", values_to="n_parents", cols = c(n_fathers, n_mothers)) %>%
mutate(population=as.factor(population),pCO2.parent=as.factor(pCO2.parent),parent=as.factor(parent)) %>%
ggplot(aes(fill=population, alpha=pCO2.parent, x=population, y=n_parents)) + geom_bar(position="dodge", stat="identity") +
scale_alpha_discrete(range = c(0.35, 0.8)) + ylab("No. of parents") +
facet_wrap(~parent) + theme_minimal() + theme(axis.text.x = element_blank()) + xlab("Population & parental pCO2 exposure")
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.
snpset.juvenile.4Colony <- snpgdsLDpruning(genofile.juvenile, maf=.4, missing.rate=0, autosome.only=FALSE)
## SNP pruning based on LD:
## Excluding 48,939 SNPs (monomorphic: TRUE, MAF: 0.4, missing rate: 0)
## # of samples: 16
## # of SNPs: 1,669
## using 1 thread
## sliding window: 500,000 basepairs, Inf SNPs
## |LD| threshold: 0.2
## method: composite
## Chromosome Contig0_5313: 1.78%, 102/5,731
## Chromosome Contig104153_109739: 1.60%, 12/750
## Chromosome Contig109741_116186: 2.09%, 14/670
## Chromosome Contig11179_21262: 1.73%, 78/4,498
## Chromosome Contig116188_123130: 2.41%, 14/580
## Chromosome Contig123131_129982: 1.42%, 6/422
## Chromosome Contig129983_136896: 1.49%, 8/536
## Chromosome Contig136897_143777: 1.95%, 13/667
## Chromosome Contig143780_150710: 2.06%, 10/486
## Chromosome Contig150711_166372: 2.20%, 10/455
## Chromosome Contig166374_181896: 2.36%, 13/550
## Chromosome Contig181898_197428: 2.36%, 10/423
## Chromosome Contig197431_213218: 2.13%, 12/564
## Chromosome Contig21263_28135: 1.94%, 97/4,994
## Chromosome Contig213221_398012: 2.16%, 11/510
## Chromosome Contig28136_34745: 1.68%, 80/4,772
## Chromosome Contig34748_41067: 1.83%, 70/3,817
## Chromosome Contig398123_696856: 1.89%, 9/476
## Chromosome Contig41068_47193: 1.72%, 55/3,201
## Chromosome Contig47194_53180: 2.16%, 61/2,819
## Chromosome Contig5314_11178: 1.77%, 25/1,414
## Chromosome Contig53181_59083: 1.89%, 48/2,534
## Chromosome Contig59084_64831: 2.01%, 36/1,792
## Chromosome Contig64832_70524: 1.93%, 32/1,654
## Chromosome Contig70525_76135: 2.10%, 28/1,331
## Chromosome Contig76136_81765: 1.81%, 24/1,329
## Chromosome Contig81766_87352: 2.08%, 25/1,203
## Chromosome Contig87353_92929: 1.50%, 13/866
## Chromosome Contig92931_98526: 2.15%, 17/791
## Chromosome Contig98527_104152: 2.07%, 16/773
## 949 markers are selected in total.
snpset.juvenile.id.4Colony <- unlist(unname(snpset.juvenile.4Colony))
snpgdsCreateGenoSet(src.fn="../qc-processing/gatk/genotypes-juvenile.gds",
dest.fn="../qc-processing/colony/genotypes-juvenile-4Colony.gds",
snp.id=snpset.juvenile.id.4Colony)
## Create a GDS genotype file:
## The new dataset consists of 16 samples and 949 SNPs
## write sample.id
## write snp.id
## write snp.rs.id
## write snp.position
## write snp.chromosome
## write snp.allele
## SNP genotypes are stored in SNP-major mode (Sample X SNP).
# Extract genotype matrix
geno.matrix4Colony.juvenile <- snpgdsGetGeno(gdsobj="../qc-processing/colony/genotypes-juvenile-4Colony.gds", with.id=TRUE)
## Genotype matrix: 16 samples X 949 SNPs
paste("No. of juvenile SNPs for Colony sibship analysis: ", ncol(geno.matrix4Colony.juvenile$genotype), sep="")
## [1] "No. of juvenile SNPs for Colony sibship analysis: 949"
paste("No. of juvenile samples for Colony sibship analysis: ", nrow(geno.matrix4Colony.juvenile$genotype), sep="")
## [1] "No. of juvenile samples for Colony sibship analysis: 16"
# Replace SNPRelate numeric allele coding with character allele coding
a <- geno.matrix4Colony.juvenile$genotype %>% as_data_frame() %>%
mutate_all(funs(str_replace(., "0", "BB"))) %>%
mutate_all(funs(str_replace(., "1", "AB"))) %>%
mutate_all(funs(str_replace(., "2", "AA")))
# Duplicate each column. Now there's 2 columns per locus, both containing the same bi-allelic genotype info.
b <- a[rep(names(a), rep(2, times=ncol(a)))] %>% clean_names()
ncol(b) == ncol(a)*2 #should be TRUE
## [1] TRUE
# For each locus, extract the first allele in column 1, and the second allele in column 2.
c <- b #duplicate dataframe
odds <- seq(from = 1, to = ncol(c), by = 2)
evens <- seq(from = 2, to = ncol(c), by = 2)
odds.names <- colnames(b[odds])
evens.names <- colnames(b[evens])
for (i in 1:length(odds.names)) {
c[[odds.names[i]]] <- substring(b[[odds.names[i]]], first = 1, last = 1)
}
for (i in 1:length(evens.names)) {
c[[evens.names[i]]] <- substring(b[[evens.names[i]]], first = 2, last = 2)
}
# Now replace the character allele information with Colony's way of numeric coding.
# There are 2 columns for each locus (e.g. v1 and v1_2), and only 3 options for each allele:
# 1= B allele
# 2= A allele
# 0= missing allele
(geno.matrix4Colony.juvenile.recode <- c %>%
mutate_all(funs(str_replace(., "B", "1"))) %>%
mutate_all(funs(str_replace(., "A", "2"))) %>%
mutate(., across(everything(), ~replace_na(.x, 0))))
## # A tibble: 16 x 1,898
## v1 v1_2 v2 v2_2 v3 v3_2 v4 v4_2 v5 v5_2 v6 v6_2 v7
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2 1 2 1 2 2 2 1 1 1 2 1 2
## 2 2 1 1 1 1 1 2 2 1 1 2 2 2
## 3 1 1 2 2 2 2 2 1 2 1 1 1 2
## 4 2 1 2 1 1 1 2 1 2 1 2 1 2
## 5 2 2 2 2 1 1 2 2 2 2 1 1 1
## 6 2 1 2 2 2 2 2 2 2 1 2 2 2
## 7 2 2 2 1 2 1 2 1 2 1 2 1 2
## 8 2 1 2 1 2 1 2 1 2 2 2 1 1
## 9 1 1 2 2 2 2 2 1 2 1 2 1 2
## 10 2 2 1 1 2 2 2 1 2 1 2 2 2
## 11 2 1 1 1 2 1 2 1 2 1 2 2 2
## 12 2 1 1 1 2 2 2 1 1 1 2 1 2
## 13 2 1 2 1 2 1 2 1 2 2 2 1 1
## 14 2 2 2 1 2 1 2 1 2 2 2 1 1
## 15 2 1 2 2 1 1 2 1 2 2 2 1 1
## 16 2 2 2 1 2 1 1 1 2 1 2 1 1
## # … with 1,885 more variables: v7_2 <chr>, v8 <chr>, v8_2 <chr>, v9 <chr>,
## # v9_2 <chr>, v10 <chr>, v10_2 <chr>, v11 <chr>, v11_2 <chr>, v12 <chr>,
## # v12_2 <chr>, v13 <chr>, v13_2 <chr>, v14 <chr>, v14_2 <chr>, v15 <chr>,
## # v15_2 <chr>, v16 <chr>, v16_2 <chr>, v17 <chr>, v17_2 <chr>, v18 <chr>,
## # v18_2 <chr>, v19 <chr>, v19_2 <chr>, v20 <chr>, v20_2 <chr>, v21 <chr>,
## # v21_2 <chr>, v22 <chr>, v22_2 <chr>, v23 <chr>, v23_2 <chr>, v24 <chr>,
## # v24_2 <chr>, v25 <chr>, v25_2 <chr>, v26 <chr>, v26_2 <chr>, v27 <chr>,
## # v27_2 <chr>, v28 <chr>, v28_2 <chr>, v29 <chr>, v29_2 <chr>, v30 <chr>,
## # v30_2 <chr>, v31 <chr>, v31_2 <chr>, v32 <chr>, v32_2 <chr>, v33 <chr>,
## # v33_2 <chr>, v34 <chr>, v34_2 <chr>, v35 <chr>, v35_2 <chr>, v36 <chr>,
## # v36_2 <chr>, v37 <chr>, v37_2 <chr>, v38 <chr>, v38_2 <chr>, v39 <chr>,
## # v39_2 <chr>, v40 <chr>, v40_2 <chr>, v41 <chr>, v41_2 <chr>, v42 <chr>,
## # v42_2 <chr>, v43 <chr>, v43_2 <chr>, v44 <chr>, v44_2 <chr>, v45 <chr>,
## # v45_2 <chr>, v46 <chr>, v46_2 <chr>, v47 <chr>, v47_2 <chr>, v48 <chr>,
## # v48_2 <chr>, v49 <chr>, v49_2 <chr>, v50 <chr>, v50_2 <chr>, v51 <chr>,
## # v51_2 <chr>, v52 <chr>, v52_2 <chr>, v53 <chr>, v53_2 <chr>, v54 <chr>,
## # v54_2 <chr>, v55 <chr>, v55_2 <chr>, v56 <chr>, v56_2 <chr>, v57 <chr>, …
# Save genotype matrix to file, while also adding a column with sample number
write_delim(cbind(sampleID=paste("O", geno.matrix4Colony.juvenile$sample.id, sep=""),
geno.matrix4Colony.juvenile.recode),
file="../qc-processing/colony/geno.matrix.juvenile.tab", delim=" ", col_names=FALSE)
I created a text file that will be the input for Colony. I used Colony’s example input file as template and edited by hand. For the genotype matrix I selected all contents in the “geno.matrix.juvenile.tab” file and pasted it into the text file.
Here’s a snapshot of the input file, truncating the genotype info (there are 16 samples, but here I only show 1)
head -n 28 ../qc-processing/colony/colony2_juveniles.dat
tail -n 12 ../qc-processing/colony/colony2_juveniles.dat
## 'juvenile_snps' !Dataset name
## 'juvenile_colony' !Output file name
## 16 ! Number of offspring in the sample
## 950 ! Number of loci
## 100 ! Seed for random number generator
## 0 ! 0/1=Not updating/updating allele frequency
## 2 ! 2/1=Dioecious/Monoecious species
## 0 ! 0/1=No inbreeding/inbreeding
## 0 ! 0/1=Diploid species/HaploDiploid species
## 0 0 ! 0/1=Polygamy/Monogamy for males & females
## 0 ! 0/1=Clone inference =No/Yes
## 0 ! 0/1=Full sibship size scaling =No/Yes
## 0 ! 0,1,2,3=No,weak,medium,strong sibship size prior; mean paternal & meteral sibship size
## 0 ! 0/1=Unknown/Known population allele frequency
## 1 ! Number of runs
## 2 ! Length of run
## 0 ! 0/1=Monitor method by Iterate#/Time in second
## 100000 ! Monitor interval in Iterate# / in seconds
## 0 ! non-Windows version
## 1 ! Fulllikelihood
## 3 ! 1/2/3=low/medium/high Precision for Fulllikelihood
##
## V@ !Marker names
## 0@ !Marker types, 0/1 = codominant/dominant
## 0.000@ !Allelic dropout rate
## 0.0001@ !false allele rate
##
## O137 2 1 2 1 2 2 2 1 1 1 2 1 2 1 1 1 2 2 2 1 1 1 2 1 2 1 2 1 1 1 2 1 2 2 2 2 2 1 2 2 2 1 1 1 2 2 2 1 1 1 1 1 1 1 2 1 2 1 2 1 2 1 1 1 2 2 2 1 1 1 2 1 2 2 2 2 2 1 2 1 1 1 2 1 2 2 2 2 2 1 1 1 2 2 1 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 1 1 1 2 1 2 1 2 1 1 1 1 1 2 2 2 1 2 1 2 2 1 1 2 1 1 1 2 1 2 1 2 2 2 1 2 2 2 1 2 1 2 2 2 1 1 1 2 2 1 1 2 1 2 1 2 1 1 1 2 2 2 1 2 1 2 1 2 1 2 2 2 2 2 1 1 1 2 1 1 1 1 1 2 1 2 1 2 1 1 1 1 1 2 1 1 1 2 1 2 1 2 1 2 1 2 2 1 1 2 1 1 1 2 1 1 1 1 1 2 1 2 1 2 1 1 1 2 1 2 1 2 1 2 1 2 2 2 1 2 2 1 1 2 2 2 1 2 1 2 1 1 1 2 1 2 1 1 1 2 2 2 1 2 1 2 1 2 1 2 1 1 1 2 1 1 1 2 1 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 1 2 1 2 2 2 1 2 2 2 1 1 1 2 2 2 1 2 1 2 1 2 1 2 1 2 2 2 1 2 1 2 1 1 1 2 1 2 1 1 1 2 1 1 1 2 2 2 1 2 1 2 1 1 1 1 1 1 1 2 2 2 2 1 1 2 1 1 1 2 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 2 2 2 2 1 1 1 2 1 2 2 2 2 2 1 1 1 2 2 1 1 2 1 2 2 2 1 2 1 2 1 2 1 1 1 1 1 2 1 2 1 1 1 2 2 2 1 2 1 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 1 2 1 2 1 2 2 1 1 1 1 2 1 1 1 2 1 2 1 2 2 1 1 1 1 2 1 2 1 2 1 1 1 2 1 1 1 2 2 2 2 2 1 1 1 2 1 1 1 2 1 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 2 2 2 2 1 2 2 2 2 2 1 2 1 2 2 2 1 2 1 2 1 1 1 2 1 2 1 2 1 1 1 2 2 1 1 2 1 2 1 1 1 2 1 2 2 2 2 2 2 2 2 2 1 2 1 2 1 2 1 2 2 2 2 2 1 1 1 2 2 1 1 1 1 1 1 2 2 2 1 2 1 2 2 2 1 1 1 1 1 1 1 2 1 2 2 1 1 1 1 2 2 2 1 2 2 2 1 2 2 1 1 1 1 2 1 2 1 2 1 2 1 1 1 2 2 2 1 2 1 2 2 2 1 2 1 2 2 2 1 2 1 2 2 2 1 2 1 2 2 1 1 1 1 2 1 2 1 2 1 2 1 2 1 2 1 1 1 2 2 2 2 2 2 2 1 1 1 2 1 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 2 1 1 2 1 2 2 2 1 2 1 2 1 2 2 2 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 2 1 2 2 1 1 2 2 2 1 2 2 2 1 2 1 2 1 2 2 1 1 1 1 2 2 2 1 2 1 2 2 2 2 1 1 2 1 2 1 2 1 2 2 2 2 1 1 2 1 2 1 2 2 1 1 2 2 2 1 2 2 2 1 2 1 1 1 2 2 2 1 2 1 2 1 2 2 1 1 1 1 2 2 2 1 2 1 1 1 2 1 2 1 1 1 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 2 1 1 2 2 2 1 2 1 2 2 1 1 2 1 2 2 2 1 2 1 2 1 2 1 2 1 2 2 1 1 2 2 2 1 2 1 1 1 2 2 1 1 2 1 2 1 2 1 2 1 2 2 2 2 2 2 2 1 2 2 2 1 2 1 2 2 1 1 1 1 2 1 1 1 2 1 1 1 2 1 2 1 2 1 1 1 2 2 2 1 2 1 2 1 2 2 2 2 2 1 2 1 2 1 2 2 2 1 2 2 2 2 2 1 2 2 2 1 2 1 2 1 2 1 1 1 2 2 2 2 2 1 2 1 1 1 2 1 2 2 1 1 2 1 2 2 2 2 2 2 1 1 2 2 2 1 1 1 2 1 1 1 1 1 2 2 2 1 1 1 2 1 1 1 2 1 2 2 2 1 2 1 2 1 2 1 2 2 2 1 2 2 1 1 2 1 1 1 1 1 1 1 2 1 1 1 2 1 2 2 1 1 2 2 2 1 2 1 2 1 1 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 2 2 1 2 2 2 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 1 1 1 2 2 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 1 2 1 1 1 2 2 2 1 2 2 2 1 2 2 2 2 2 1 2 1 1 1 2 2 2 1 1 1 2 1 2 1 2 2 1 1 2 2 2 1 1 1 2 1 1 1 2 2 2 1 1 1 2 2 1 1 2 1 1 1 2 2 1 1 2 1 1 1 2 1 2 2 1 1 2 1 1 1 2 1 1 1 2 2 2 2 2 1 1 1 2 2 2 1 2 2 1 1 2 2 2 1 2 1 2 2 1 1 2 1 1 1 2 1 2 1 2 1 1 1 2 1 2 1 2 2 2 2 1 1 2 1 2 2 2 1 2 1 2 1 1 1 2 2 2 1 2 1 1 1 2 1 2 2 2 1 2 1 2 1 1 1 2 1 2 2 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 2 2 2 2 2 2 1 1 1 1 1 2 1 1 1 2 2 1 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 1 1 1 2 2 2 1 2 1 2 2 1 1 1 1 1 1 2 1 1 1 2 1 2 2 2 2 1 1 2 1 2 1 2 1 2 2 2 2 2 1 1 1 1 1 1 1 1 1 2 1 2 2 2 1 1 1 2 2 2 2 2 2 2 2 2 1 2 1 2 1 2 1 2 1 2 2 1 1 2 1 1 1 1 1 2 1 2 2 2 1 1 1 1 1 2 2 1 1 2 1 2 1 2 2 2 1 2 2 2 1 2 1 2 1 2 1 1 1 2 1 2 2 2 2 1 1 2 1 2 1 2 2 2 1 2 1 1 1 1 1 2 2 2 1 2 2 2 1 1 1 2 2 2 2 2 1 2 2 2 2 1 1 2 2 2 2 2 1 2 1 2 2 2 1 2 1 2 1 2 1 2 2 2 1 2 1 2 1 2 2 2 1 2 1 1 1 2 2 1 1 2 1 2 1 2 1 2 1 2 1 1 1 2 1 2 1 2 1 1 1 2 1 1 1 2 2 2 2 2 2 2 1 2 2 2 1 2 1 2 1 1 1 2 1 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 1 1 2 1 2 1 2 2 1 1 2 1 1 1 1 1 2 2 1 1 1 1 2 2 2 1 1 1 2 1 2 1 2 1 2 1 1 1 2 1 1 1 2 1 2 1 1 1 2 2 2 2 2 2 2 1 1 1 2 2 2 2 1 1 2 1 2 2 2 1 2 1 2 1 2 1 1 1 1 1 2 1 2 2 2 2 2 1 1 1 2 1 2 1 2 1 2 1 1 1 2 1 1 1 2 1 1 1 2 1 2 1 2 1 2 1 2 1 1 1 2 2 2 2 2 1 2 2 1 1 2 1 1 1 1 1 1 1 2 1 1 1 2 2 1 1 1 1 2 1 2 1 2 2 2 1 2 1 1 1 2 1 2 1 2 2 2 1 2 2 2 1 2 1 2 2 2 2 2 1 2 2 2 2 2 1 2 1 2 1 2 1 2 1 2 1 1 1 2 2 2 1 2 1 2 1 2 1 1 1 2 2 2 1 2 1 2 1 1 1 2 1 1 1 2 1 2 2 1 1 2 1 2 1 2 1 2 2 2 2 2 1 2 2 1 1 2 1 2 1 2 1 2 1 2 1
##
##
## 0 0 !prob. of dad/mum included in the candidates
## 0 0 !numbers of candiadte males & females
## 0 0 !#known fater-offspring dyads, paternity exclusion threshold
## 0 0 !#known moter-offspring dyads, maternity exclusion threshold
## 0 !#known paternal sibship with unknown fathers
## 0 !#known maternal sibship with unknown mothers
## 0 !#known paternity exclusions
## 0 !#known maternity exclusions
## 0 !#known paternal sibship exclusions
## 0 !#known maternal sibship exclusions
Note: I have to be in the colony directory to run it, so I change my directory in the same code chunk prior to executing the ./colony2s.out script
cd ../qc-processing/colony/
./colony2s.out IFN:colony2_juveniles.dat
cat ../qc-processing/colony/juvenile_colony.BestCluster
## ClusterIndex Probability OffspringID FatherID MotherID
## 1 0.4954 O137 *1 #1
## 1 0.4954 O139 *2 #2
## 1 0.4954 O140 *2 #1
## 1 0.4954 O141 *1 #2
## 2 0.3684 O156 *3 #3
## 2 0.3684 O183 *10 #7
## 2 0.3684 O184 *10 #8
## 2 0.3684 O185 *3 #7
## 3 0.8991 O159 *4 #4
## 3 0.8991 O161 *5 #4
## 3 0.8991 O162 *5 #4
## 4 0.8583 O168 *6 #5
## 4 0.8583 O169 *7 #5
## 4 0.8583 O171 *7 #5
## 4 0.8583 O172 *8 #5
## 5 1.0000 O181 *9 #6
(colony.clusters.juvenile <- read_table("../qc-processing/colony/juvenile_colony.BestCluster", col_names=TRUE) %>% clean_names() %>%
mutate_all(funs(str_replace(., "\\*|#", ""))) %>%
mutate(cluster_index=as.numeric(cluster_index), father_id=as.numeric(father_id), mother_id=as.numeric(mother_id), probability=as.numeric(probability)) %>%
mutate(offspring_id=str_replace(offspring_id, "O", "")) %>% as.data.frame() %>% left_join(sample.info.juvenile, by = c("offspring_id"="sample")))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ClusterIndex = col_double(),
## Probability = col_double(),
## OffspringID = col_character(),
## FatherID = col_character(),
## MotherID = col_character()
## )
## cluster_index probability offspring_id father_id mother_id stage
## 1 1 0.4954 137 1 1 juvenile
## 2 1 0.4954 139 2 2 juvenile
## 3 1 0.4954 140 2 1 juvenile
## 4 1 0.4954 141 1 2 juvenile
## 5 2 0.3684 156 3 3 juvenile
## 6 2 0.3684 183 10 7 juvenile
## 7 2 0.3684 184 10 8 juvenile
## 8 2 0.3684 185 3 7 juvenile
## 9 3 0.8991 159 4 4 juvenile
## 10 3 0.8991 161 5 4 juvenile
## 11 3 0.8991 162 5 4 juvenile
## 12 4 0.8583 168 6 5 juvenile
## 13 4 0.8583 169 7 5 juvenile
## 14 4 0.8583 171 7 5 juvenile
## 15 4 0.8583 172 8 5 juvenile
## 16 5 1.0000 181 9 6 juvenile
## population pCO2.parent pop_code
## 1 Dabob Bay Ambient Dabob Bay-Ambient
## 2 Dabob Bay Ambient Dabob Bay-Ambient
## 3 Dabob Bay Ambient Dabob Bay-Ambient
## 4 Dabob Bay Ambient Dabob Bay-Ambient
## 5 Fidalgo Bay Ambient Fidalgo Bay-Ambient
## 6 Fidalgo Bay High Fidalgo Bay-High
## 7 Fidalgo Bay High Fidalgo Bay-High
## 8 Fidalgo Bay High Fidalgo Bay-High
## 9 Fidalgo Bay Ambient Fidalgo Bay-Ambient
## 10 Fidalgo Bay Ambient Fidalgo Bay-Ambient
## 11 Fidalgo Bay Ambient Fidalgo Bay-Ambient
## 12 Dabob Bay High Dabob Bay-High
## 13 Dabob Bay High Dabob Bay-High
## 14 Dabob Bay High Dabob Bay-High
## 15 Dabob Bay High Dabob Bay-High
## 16 Fidalgo Bay High Fidalgo Bay-High
save(colony.clusters.juvenile, file = "../qc-processing/colony/best-cluster-juvenile")
# Plot mother ~ father ID
ggplotly(
ggplot(colony.clusters.juvenile, aes(x=father_id, y=mother_id, size=probability, colour=population, shape=pCO2.parent)) + geom_jitter() + theme_minimal() + scale_shape_manual(values=c(1,2)) + scale_size_continuous(guide="none"))
# How many fathers and mothers does each population + pCO2 group have?
colony.clusters.juvenile %>%
group_by(population, pCO2.parent) %>%
summarise(n_fathers=n_distinct(father_id),n_mothers=n_distinct(mother_id)) %>%
pivot_longer(names_to = "parent", values_to="n_parents", cols = c(n_fathers, n_mothers)) %>%
mutate(population=as.factor(population),pCO2.parent=as.factor(pCO2.parent),parent=as.factor(parent)) %>%
ggplot(aes(fill=population, alpha=pCO2.parent, x=population, y=n_parents)) + geom_bar(position="dodge", stat="identity") +
scale_alpha_discrete(range = c(0.35, 0.8)) + ylab("No. of parents") +
facet_wrap(~parent) + theme_minimal() + theme(axis.text.x = element_blank()) + xlab("Population & parental pCO2 exposure")
## `summarise()` has grouped output by 'population'. You can override using the `.groups` argument.
## Warning: Using alpha for a discrete variable is not advised.